Update internal ovxlib to rel/1.2.14 (#699)

Type: New Feature

Signed-off-by: Feiyue Chen <Feiyue.Chen@verisilicon.com>
Chen Feiyue 2024-07-08 09:29:24 +08:00 committed by GitHub
parent 8894360c74
commit c8b7c410bf
94 changed files with 14958 additions and 320 deletions

View File

@ -1 +1 @@
1.2.6
1.2.14

View File

@ -199,3 +199,7 @@ DEF_OP(CROP_AND_RESIZE)
DEF_OP(TAN)
DEF_OP(RMSNORM)
DEF_OP(SHAPE)
DEF_OP(BITCAST)
DEF_OP(GROUPED_CONV3D)
DEF_OP(COL2IM)
DEF_OP(L1_LAYER_NORM)

View File

@ -0,0 +1,44 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_BITCAST_H
#define _VSI_NN_OP_BITCAST_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_bitcast_param
{
struct _bitcast_local_data_t* local;
} vsi_nn_bitcast_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,49 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_COL2IM_H
#define _VSI_NN_OP_COL2IM_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_col2im_param
{
const int32_t* image_shape;
const int32_t* block_shape;
int32_t strides[3];
int32_t pads[6];
int32_t dilations[3];
int32_t dim_num;
} vsi_nn_col2im_param;
#ifdef __cplusplus
}
#endif
#endif
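
For orientation, here is a minimal, hedged sketch of how a caller might populate the new vsi_nn_col2im_param before wiring up a COL2IM node. The field names come from the struct above; the concrete geometry values, the reading of dim_num as the number of spatial dimensions, and the pad ordering are illustrative assumptions, not something this header states.

#include <stdint.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "ops/vsi_nn_op_col2im.h"

/* Illustrative geometry only; real values depend on the model. */
static const int32_t image_shape[2] = { 8, 8 };  /* spatial shape of the reconstructed image */
static const int32_t block_shape[2] = { 3, 3 };  /* sliding block (kernel) shape */

static void fill_col2im_param( vsi_nn_col2im_param * p )
{
    memset( p, 0, sizeof( *p ) );
    p->image_shape  = image_shape;
    p->block_shape  = block_shape;
    p->strides[0]   = 1;  p->strides[1]   = 1;
    p->dilations[0] = 1;  p->dilations[1] = 1;
    /* pads[] stays zero; the front/end ordering per spatial dim is assumed */
    p->dim_num = 2;  /* assumed: number of spatial dimensions covered by block_shape */
}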

View File

@ -0,0 +1,55 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_GROUPED_CONV3D_H
#define _VSI_NN_OP_GROUPED_CONV3D_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_grouped_conv3d_param
{
void* local;
uint32_t ksize[3];
uint32_t stride[3];
/* Pad left, right, top, bottom, front, rear */
uint32_t pad[6];
/* Pad type default value shall be AUTO */
vsi_nn_pad_e pad_type;
uint32_t weights;
uint32_t group;
uint32_t dilation[3];
int32_t multiplier;
vsi_nn_pad_mode_e pad_mode;
} vsi_nn_grouped_conv3d_param;
#ifdef __cplusplus
}
#endif
#endif
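
Similarly, a hedged sketch of filling the new vsi_nn_grouped_conv3d_param, by analogy with the existing 2-D grouped convolution parameters. Only the pad ordering and the AUTO default are stated by the struct comments; the reading of weights as the output-channel count and of multiplier are assumptions, and all numeric values are illustrative.

#include <stdint.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "ops/vsi_nn_op_grouped_conv3d.h"

static void fill_grouped_conv3d_param( vsi_nn_grouped_conv3d_param * p )
{
    memset( p, 0, sizeof( *p ) );
    p->ksize[0]  = 3;  p->ksize[1]  = 3;  p->ksize[2]  = 3;
    p->stride[0] = 1;  p->stride[1] = 1;  p->stride[2] = 1;
    /* pad order per the struct comment: left, right, top, bottom, front, rear */
    p->pad_type = VSI_NN_PAD_AUTO;  /* "default value shall be AUTO" per the comment */
    p->weights  = 16;               /* assumed: number of output channels */
    p->group    = 2;                /* number of convolution groups */
    p->dilation[0] = 1;  p->dilation[1] = 1;  p->dilation[2] = 1;
    p->multiplier  = 0;             /* assumed: non-depthwise case */
}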

View File

@ -0,0 +1,47 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_L1_LAYER_NORM_H
#define _VSI_NN_OP_L1_LAYER_NORM_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_l1_layer_norm_param
{
struct _l1_layer_norm_local_data_t * local;
float eps;
int32_t axis;
} vsi_nn_l1_layer_norm_param;
#ifdef __cplusplus
}
#endif
#endif
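
The L1 layer-norm parameter block only carries an epsilon and the normalization axis; a trivial, hedged example follows (the typical eps value and the axis convention are assumptions):

#include <string.h>
#include "vsi_nn_types.h"
#include "ops/vsi_nn_op_l1_layer_norm.h"

static void fill_l1_layer_norm_param( vsi_nn_l1_layer_norm_param * p )
{
    memset( p, 0, sizeof( *p ) );
    p->eps  = 1e-5f;  /* small constant added to the L1 denominator (typical value, assumed) */
    p->axis = 0;      /* axis to normalize over (assumed to follow the LAYER_NORM convention) */
}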

View File

@ -349,7 +349,7 @@ vsi_bool vsi_nn_IsEVISFeatureAvaiable
vsi_nn_context_t context
);
int32_t vsi_nn_compareVersion
OVXLIB_API int32_t vsi_nn_compareVersion
(
vsi_nn_graph_t * graph,
uint32_t version_major,

File diff suppressed because it is too large

View File

@ -26,6 +26,7 @@
#define _VSI_NN_CONTEXT_H
#include "vsi_nn_platform.h"
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
@ -75,12 +76,19 @@ typedef struct _vsi_nn_runtime_option_t
int32_t enable_shader;
int32_t enable_opcheck;
int32_t enable_concat_optimize;
int32_t enable_asymi8_to_u8;
/* 0: disable int8 to uint8 conversion
* 1: convert asymmetric int8 to asymmetric uint8
* 2: convert both asymmetric and symmetric int8 to asymmetric uint8
*/
int32_t enable_i8_to_u8;
int32_t enable_dataconvert_optimize;
int32_t enable_stream_processor;
int32_t enable_rgb88_planar_nhwc;
int32_t enable_slice_optimize;
int32_t enable_batch_opt;
int32_t enable_save_file_type;
int32_t enable_use_image_process;
int32_t enable_use_from_handle;
} vsi_nn_runtime_option_t;
/**
@ -101,6 +109,10 @@ typedef struct _vsi_nn_context_t
OVXLIB_API vsi_nn_context_t vsi_nn_CreateContext
( void );
OVXLIB_API vsi_status vsi_nn_initOptions
(
vsi_nn_runtime_option_t *options
);
/**
* Release context
* Release ovxlib NN runtime resource and reset context handle to NULL.
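
For context, the option block above replaces enable_asymi8_to_u8 with the three-state enable_i8_to_u8 and adds several new switches, and vsi_nn_initOptions is now exported. Below is a hedged sketch of using them together; the assumption that vsi_nn_initOptions fills the struct with default values, and the question of how the struct is then attached to a context, are not answered by this hunk.

#include "vsi_nn_context.h"

static vsi_status setup_runtime_options( void )
{
    vsi_nn_runtime_option_t options;
    vsi_status status = vsi_nn_initOptions( &options );  /* assumed: populates default values */
    if ( status != VSI_SUCCESS )
    {
        return status;
    }
    /* 2 = convert both asymmetric and symmetric int8 to asymmetric uint8 */
    options.enable_i8_to_u8 = 2;
    options.enable_dataconvert_optimize = 1;
    return VSI_SUCCESS;
}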

View File

@ -53,5 +53,9 @@
#if defined(VX_13_NN_COMPATIBLITY)
#define VSI_MAP_TENSOR_PATCH_SUPPORT
#endif
#if defined (VX_QUANT_PER_GROUP_SUPPORT)
#define VSI_PER_GROUP_QUANTIZATION_SUPPORT
#endif
#define VSI_GRAPH_RUNTIME_ENV_SUPPORT
#endif

View File

@ -814,11 +814,77 @@ OVXLIB_API vsi_status vsi_nn_ExecuteGraphLoop
vsi_nn_tensor_t *max_iteration_tensor
);
OVXLIB_API vsi_status vsi_nn_SetGraphTransformOption
/**
* Set runtime variable
* Set runtime variable for ovxlib and driver.
*
* @param[in] graph Graph handle
* @param[in] key Ovxlib or driver environment variable name
* Ovxlib supported keys:
* VSI_NN_ENABLE_I8TOU8
* VSI_NN_ENABLE_OPCHECK
* VSI_SAVE_FILE_TYPE
* VSI_USE_IMAGE_PROCESS
* VSI_NN_ENABLE_CONCAT_OPTIMIZE
* VSI_NN_ENABLE_DATACONVERT_OPTIMIZE
* VSI_VX_ENABLE_STREAM_PROCESSOR
* VSI_NN_FORCE_RGB888_OUT_NHWC
* VSI_NN_ENABLE_SLICE_OPTIMIZE
* VSI_VX_ENABLE_BATCH_OPT
* VSI_USE_FROM_HANDLE
* Driver keys:
* VIV_VX_ENABLE_GRAPH_TRANSFORM
* VIV_VX_ENABLE_SHADER
* Any key other than the ovxlib keys listed above is treated as a driver environment variable.
* @return VSI_SUCCESS on success, or appropriate error code otherwise
*/
OVXLIB_API vsi_status vsi_nn_SetRunTimeVariable
(
vsi_nn_graph_t* graph,
const char* ctrl_str,
size_t size
const char* key,
const char* value
);
/**
* Get runtime variable
* Get runtime variable of ovxlib.
*
* @param[in] graph Graph handle
* @param[in] key Environment variable name
* Supported keys:
* VSI_NN_ENABLE_I8TOU8
* VSI_NN_ENABLE_OPCHECK
* VSI_SAVE_FILE_TYPE
* VSI_USE_IMAGE_PROCESS
* VSI_NN_ENABLE_CONCAT_OPTIMIZE
* VSI_NN_ENABLE_DATACONVERT_OPTIMIZE
* VSI_VX_ENABLE_STREAM_PROCESSOR
* VSI_NN_FORCE_RGB888_OUT_NHWC
* VSI_NN_ENABLE_SLICE_OPTIMIZE
* VSI_VX_ENABLE_BATCH_OPT
* VSI_USE_FROM_HANDLE
* VIV_VX_ENABLE_GRAPH_TRANSFORM
* VIV_VX_ENABLE_SHADER
* Only the keys listed above are supported.
* @return The variable's value on success, or NULL otherwise. Note: on success,
* the caller must release the returned memory after use.
*/
OVXLIB_API char* vsi_nn_GetRunTimeVariable
(
const vsi_nn_graph_t* graph,
const char* key
);
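
A hedged usage sketch for the Set/Get pair above, using one of the listed ovxlib keys. The accepted value strings and the allocator pairing for the returned buffer are assumptions; the header only says the caller must release the memory.

#include <stdio.h>
#include <stdlib.h>
#include "vsi_nn_graph.h"

static void tune_graph( vsi_nn_graph_t * graph )
{
    /* Enable int8 -> uint8 conversion for this graph ("1" assumed to be a valid value). */
    if ( vsi_nn_SetRunTimeVariable( graph, "VSI_NN_ENABLE_I8TOU8", "1" ) != VSI_SUCCESS )
    {
        return;
    }
    /* Read it back; the caller owns the returned string. */
    char * value = vsi_nn_GetRunTimeVariable( graph, "VSI_NN_ENABLE_I8TOU8" );
    if ( value )
    {
        printf( "VSI_NN_ENABLE_I8TOU8 = %s\n", value );
        free( value );  /* assumed allocator pairing */
    }
}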
int32_t vsi_nn_GetVariable(const char* variableKey);
OVXLIB_API char* vsi_nn_GenerateGraphJson
(
vsi_nn_graph_t* graph
);
OVXLIB_API vsi_status vsi_nn_ReleaseGraphJson
(
char* json
);
/**

View File

@ -212,6 +212,10 @@
#include "ops/vsi_nn_op_crop_and_resize.h"
#include "ops/vsi_nn_op_rmsnorm.h"
#include "ops/vsi_nn_op_shape.h"
#include "ops/vsi_nn_op_bitcast.h"
#include "ops/vsi_nn_op_grouped_conv3d.h"
#include "ops/vsi_nn_op_col2im.h"
#include "ops/vsi_nn_op_l1_layer_norm.h"
/* custom node header defines */
#include "custom/vsi_nn_custom_node_type.h"
#include "ops/vsi_nn_op_inverse_sigmoid.h"
@ -412,6 +416,10 @@ typedef union _vsi_nn_nn_param
vsi_nn_crop_and_resize_param crop_and_resize;
vsi_nn_rmsnorm_param rmsnorm;
vsi_nn_shape_param shape;
vsi_nn_bitcast_param bitcast;
vsi_nn_grouped_conv3d_param grouped_conv3d;
vsi_nn_col2im_param col2im;
vsi_nn_l1_layer_norm_param l1_layer_norm;
void* client_param;
/* custom node data struct define */

View File

@ -86,6 +86,8 @@ typedef enum
VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8 = 0x6,
/** perchannel float8 */
VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8 = 0x7,
/** GPTQ */
VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC = 0x8,
/** undefined type */
VSI_NN_QNT_TYPE_NA = 0xff,
} vsi_nn_qnt_type_e;
@ -126,6 +128,16 @@ typedef struct vsi_nn_dtype
const int32_t * zero_points;
int32_t zero_points_dim;
};
#endif
#ifdef VSI_PER_GROUP_QUANTIZATION_SUPPORT
/** Meaningful in GPTQ_SYMMETRIC */
struct {
const float* group_scales;
int32_t group_channel_dim;
int32_t group_size;
const int32_t* group_zero_points;
int32_t group_count;
};
#endif
};
};
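
A hedged sketch of describing a per-group (GPTQ-style) symmetrically quantized tensor with the new fields. The reading of group_size as elements per scale group, group_channel_dim as the dimension being split, and group_count as the number of groups follows the field names only; this hunk does not define their semantics.

#include "vsi_nn_tensor.h"

static void fill_group_quant_dtype( vsi_nn_dtype_t * dtype,
                                    const float * scales,
                                    const int32_t * zero_points )
{
#ifdef VSI_PER_GROUP_QUANTIZATION_SUPPORT
    dtype->qnt_type          = VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC;
    dtype->group_scales      = scales;       /* one scale per group (assumed layout) */
    dtype->group_zero_points = zero_points;
    dtype->group_channel_dim = 1;            /* dimension that is split into groups (assumed) */
    dtype->group_size        = 64;           /* elements per group (assumed meaning) */
    dtype->group_count       = 2;            /* groups along that dimension (assumed meaning) */
#else
    (void)dtype; (void)scales; (void)zero_points;
#endif
}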

View File

@ -33,7 +33,7 @@ extern "C"{
#define VSI_NN_VERSION_MAJOR 1
#define VSI_NN_VERSION_MINOR 2
#define VSI_NN_VERSION_PATCH 5
#define VSI_NN_VERSION_PATCH 14
#define VSI_NN_VERSION \
(VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)

View File

@ -35,6 +35,8 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#if (!VX_ARGMAX_VX_SUPPORT)
__BEGIN_DECLS
@ -289,3 +291,5 @@ OnError:
__END_DECLS
REGISTER_BACKEND_CL( argmax, _setup )
#endif

View File

@ -0,0 +1,432 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
__BEGIN_DECLS
#define _COL2IM_KERNEL_SOURCE_NAME "col2im"
// Add kernel hashtable here
#define COL2IM_HASH_KEY( IN_DTYPE, OUT_DTYPE, _image_2d) \
(( IN_DTYPE << 16 ) | ( OUT_DTYPE << 8 | (_image_2d)))
#define COL2IM_KERNELS( IN_DTYPE, OUT_DTYPE ) \
{ COL2IM_HASH_KEY( IN_DTYPE, OUT_DTYPE , 0), \
CVIVANTE_NAMESPACE("cl.col2im_"#IN_DTYPE"to"#OUT_DTYPE), \
_COL2IM_KERNEL_SOURCE_NAME }
#define COL2IM_KERNELS_2D( IN_DTYPE, OUT_DTYPE ) \
{ COL2IM_HASH_KEY( IN_DTYPE, OUT_DTYPE , 1), \
CVIVANTE_NAMESPACE("cl.col2im_"#IN_DTYPE"to"#OUT_DTYPE"_2D"), \
_COL2IM_KERNEL_SOURCE_NAME }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _col2im_kernel_map[] =
{
// Register kernel here
COL2IM_KERNELS( F32, F32 ),
COL2IM_KERNELS( F32, U32 ),
COL2IM_KERNELS( F32, I32 ),
COL2IM_KERNELS( U32, U32 ),
COL2IM_KERNELS( U32, F32 ),
COL2IM_KERNELS( U32, I32 ),
COL2IM_KERNELS( I32, I32 ),
COL2IM_KERNELS( I32, U32 ),
COL2IM_KERNELS( I32, F32 ),
COL2IM_KERNELS_2D( F32, F32 ),
COL2IM_KERNELS_2D( F32, U32 ),
COL2IM_KERNELS_2D( F32, I32 ),
COL2IM_KERNELS_2D( U32, U32 ),
COL2IM_KERNELS_2D( U32, F32 ),
COL2IM_KERNELS_2D( U32, I32 ),
COL2IM_KERNELS_2D( I32, I32 ),
COL2IM_KERNELS_2D( I32, U32 ),
COL2IM_KERNELS_2D( I32, F32 ),
};
/*
* Kernel params
*/
static vx_param_description_t _col2im_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _COL2IM_PARAM_NUM _cnt_of_array( _col2im_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_col2im_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3, // workdim
{0, 0, 0}, // globalWorkOffset: start location to be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels are processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in threads
{0, 0, 0} // globalWorkSize: image size in threads
};
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
vsi_size_array_t * in_shape = NULL;
int32_t stride_w = 1, stride_h = 1;
int32_t dilation_w = 1, dilation_h = 1, dilation_d = 1;
int32_t pad_w_front = 0, pad_w_end = 0, pad_h_front = 0, pad_h_end = 0, pad_d_front = 0, pad_d_end = 0;
int32_t kernel_w = 1, kernel_h = 1, kernel_d = 1;
int32_t move_time_x = 0;
int32_t move_time_y = 0;
int32_t width_pad = 0;
int32_t height_pad = 0;
int32_t depth_pad = 0;
int32_t kernel_x_new = 1;
int32_t kernel_y_new = 1;
int32_t kernel_z_new = 1;
int32_t batch = 1;
int32_t width = 1;
int32_t height = 1;
int32_t depth = 1;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &stride_w);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &stride_h);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &dilation_w);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &dilation_h);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &dilation_d);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &pad_w_front);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &pad_w_end);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &pad_h_front);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &pad_h_end);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &pad_d_front);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &pad_d_end);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[14], &kernel_w);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[15], &kernel_h);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[16], &kernel_d);
CHECK_STATUS_FAIL_GOTO(status, final );
batch = (int32_t)(attr[0]->shape->data[2]);
width = (int32_t)(attr[1]->shape->data[0]);
height = (int32_t)(attr[1]->shape->data[1]);
depth = (int32_t)(attr[1]->shape->data[2]) / batch;
width_pad = width + pad_w_front + pad_w_end;
height_pad = height + pad_h_front + pad_h_end;
depth_pad = depth + pad_d_front + pad_d_end;
move_time_x = (width_pad - ((kernel_w - 1) * dilation_w + 1) + stride_w) / stride_w;
move_time_y = (height_pad - ((kernel_h - 1) * dilation_h + 1) + stride_h) / stride_h;
kernel_x_new = (kernel_w - 1) * dilation_w + 1;
kernel_y_new = (kernel_h - 1) * dilation_h + 1;
kernel_z_new = (kernel_d - 1) * dilation_d + 1;
status = vsi_nn_kernel_gpu_add_param( node, "width_pad", &width_pad );
status |= vsi_nn_kernel_gpu_add_param( node, "height_pad", &height_pad );
status |= vsi_nn_kernel_gpu_add_param( node, "depth_pad", &depth_pad );
status |= vsi_nn_kernel_gpu_add_param( node, "move_time_x", &move_time_x );
status |= vsi_nn_kernel_gpu_add_param( node, "move_time_y", &move_time_y );
status |= vsi_nn_kernel_gpu_add_param( node, "kernel_x_new", &kernel_x_new );
status |= vsi_nn_kernel_gpu_add_param( node, "kernel_y_new", &kernel_y_new );
status |= vsi_nn_kernel_gpu_add_param( node, "kernel_z_new", &kernel_z_new );
status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth );
CHECK_STATUS_FAIL_GOTO(status, final );
in_shape = attr[1]->shape;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = in_shape->data[0];
gpu_param.global_size[1] = in_shape->data[1];
gpu_param.global_size[2] = in_shape->data[2];
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
}
return status;
} /* _col2im_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
vsi_bool image_2d
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _col2im_kernel_map;
size_t kernel_map_size = _cnt_of_array( _col2im_kernel_map );
vx_param_description_t * param_def = _col2im_kernel_param_def;
vx_kernel_initialize_f initializer = _col2im_initializer;
uint32_t key;
uint32_t i;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (F16 == in_dtype)
{
in_dtype = F32;
}
else if (U8 == in_dtype)
{
in_dtype = U32;
}
else if (I8 == in_dtype || I16 == in_dtype)
{
in_dtype = I32;
}
if (F16 == out_dtype)
{
out_dtype = F32;
}
else if (U8 == out_dtype)
{
out_dtype = U32;
}
else if (I8 == out_dtype || I16 == out_dtype)
{
out_dtype = I32;
}
key = COL2IM_HASH_KEY( in_dtype, out_dtype ,image_2d);
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = _cnt_of_array( _col2im_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_COL2IM_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
vsi_bool image_2d = FALSE;
vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL;
vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{0}};
float inputScale = vsi_nn_get_tensor_scale(inputs[0]);
float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
float inputZp = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float outputZp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float inOutScale = inputScale / outputScale;
float inOutTile = outputZp - inOutScale * inputZp;
int32_t stride_w = vsi_nn_kernel_param_get_int32( params, "stride_w" );
int32_t stride_h = vsi_nn_kernel_param_get_int32( params, "stride_h" );
int32_t stride_d = vsi_nn_kernel_param_get_int32( params, "stride_d" );
int32_t dilation_w = vsi_nn_kernel_param_get_int32( params, "dilation_w" );
int32_t dilation_h = vsi_nn_kernel_param_get_int32( params, "dilation_h" );
int32_t dilation_d = vsi_nn_kernel_param_get_int32( params, "dilation_d" );
int32_t pad_w_front = vsi_nn_kernel_param_get_int32( params, "pad_w_front" );
int32_t pad_w_end = vsi_nn_kernel_param_get_int32( params, "pad_w_end" );
int32_t pad_h_front = vsi_nn_kernel_param_get_int32( params, "pad_h_front" );
int32_t pad_h_end = vsi_nn_kernel_param_get_int32( params, "pad_h_end" );
int32_t pad_d_front = vsi_nn_kernel_param_get_int32( params, "pad_d_front" );
int32_t pad_d_end = vsi_nn_kernel_param_get_int32( params, "pad_d_end" );
size_t dim_num = 0;
int32_t* block_shape = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "block_shape", &dim_num);
int32_t kernel_w = block_shape[0];
int32_t kernel_h = dim_num > 1 ? block_shape[1] : 1;
int32_t kernel_d = dim_num > 2 ? block_shape[2] : 1;
VSI_UNREFERENCED(params);
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
image_2d = dim_num > 2 ? FALSE : TRUE;
shapes[0][0] = inputs[0]->attr.size[0];
shapes[0][1] = inputs[0]->attr.size[1] / outputs[0]->attr.size[dim_num];
shapes[0][2] = inputs[0]->attr.size[2] * outputs[0]->attr.size[dim_num];
shapes[1][0] = outputs[0]->attr.size[0];
shapes[1][1] = outputs[0]->attr.size[1];
if (image_2d)
{
shapes[1][2] = outputs[0]->attr.size[2] * outputs[0]->attr.size[3];
}
else
{
shapes[1][2] = outputs[0]->attr.size[2] * outputs[0]->attr.size[3] * outputs[0]->attr.size[4];
}
rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[0], 3 );
rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[1], 3 );
if (rs_input == NULL || rs_output == NULL)
{
goto final;
}
status = _query_kernel( kernel, inputs, outputs, image_2d );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
node_params[0] = rs_input;
node_params[1] = rs_output;
node_params[2] = vsi_nn_kernel_scalar_create( graph, I32, &stride_w );
node_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &stride_h );
node_params[4] = vsi_nn_kernel_scalar_create( graph, I32, &stride_d );
node_params[5] = vsi_nn_kernel_scalar_create( graph, I32, &dilation_w );
node_params[6] = vsi_nn_kernel_scalar_create( graph, I32, &dilation_h );
node_params[7] = vsi_nn_kernel_scalar_create( graph, I32, &dilation_d );
node_params[8] = vsi_nn_kernel_scalar_create( graph, I32, &pad_w_front );
node_params[9] = vsi_nn_kernel_scalar_create( graph, I32, &pad_w_end );
node_params[10] = vsi_nn_kernel_scalar_create( graph, I32, &pad_h_front );
node_params[11] = vsi_nn_kernel_scalar_create( graph, I32, &pad_h_end );
node_params[12] = vsi_nn_kernel_scalar_create( graph, I32, &pad_d_front );
node_params[13] = vsi_nn_kernel_scalar_create( graph, I32, &pad_d_end );
node_params[14] = vsi_nn_kernel_scalar_create( graph, I32, &kernel_w );
node_params[15] = vsi_nn_kernel_scalar_create( graph, I32, &kernel_h );
node_params[16] = vsi_nn_kernel_scalar_create( graph, I32, &kernel_d );
node_params[17] = vsi_nn_kernel_scalar_create( graph, F32, &inOutScale );
node_params[18] = vsi_nn_kernel_scalar_create( graph, F32, &inOutTile );
status = vsi_nn_kernel_node_pass_param( node, node_params, _COL2IM_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &node_params[2] );
vsi_nn_kernel_scalar_release( &node_params[3] );
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
vsi_nn_kernel_scalar_release( &node_params[6] );
vsi_nn_kernel_scalar_release( &node_params[7] );
vsi_nn_kernel_scalar_release( &node_params[8] );
vsi_nn_kernel_scalar_release( &node_params[9] );
vsi_nn_kernel_scalar_release( &node_params[10] );
vsi_nn_kernel_scalar_release( &node_params[11] );
vsi_nn_kernel_scalar_release( &node_params[12] );
vsi_nn_kernel_scalar_release( &node_params[13] );
vsi_nn_kernel_scalar_release( &node_params[14] );
vsi_nn_kernel_scalar_release( &node_params[15] );
vsi_nn_kernel_scalar_release( &node_params[16] );
vsi_nn_kernel_scalar_release( &node_params[17] );
}
}
final:
if (rs_input)
{
vsi_nn_kernel_tensor_release( &rs_input );
}
if (rs_output)
{
vsi_nn_kernel_tensor_release( &rs_output );
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( col2im, _setup )
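
As a quick worked check of the geometry the initializer above derives (presumably the dilated kernel extent and the number of sliding block positions), with illustrative values:

#include <stdio.h>

static void col2im_geometry_example( void )
{
    int width = 8, pad_w_front = 1, pad_w_end = 1;
    int kernel_w = 3, dilation_w = 2, stride_w = 2;

    int width_pad    = width + pad_w_front + pad_w_end;                   /* 10 */
    int kernel_x_new = (kernel_w - 1) * dilation_w + 1;                   /* 5: dilated kernel extent */
    int move_time_x  = (width_pad - kernel_x_new + stride_w) / stride_w;  /* 3: sliding positions along x */

    printf( "%d %d %d\n", width_pad, kernel_x_new, move_time_x );
}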

View File

@ -46,21 +46,36 @@ __BEGIN_DECLS
#define KERNEL_SOURCE_1 "cumsum"
#define KERNEL_SOURCE_2 "cumsum_2d"
#define KERNEL_SOURCE_3 "cumsum_array_axis0"
#define KERNEL_SOURCE_4 "cumsum_array_axis1"
#define KERNEL_SOURCE_5 "cumsum_array_axis2"
#define KERNEL_SOURCE_6 "cumsum_array_2d_axis0"
#define KERNEL_SOURCE_7 "cumsum_array_2d_axis1"
// Add kernel hashtable here
#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \
((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d))
#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d, is_array) \
((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 8) | (_image_2d << 4) | (is_array))
#define HASH_CUMSUM_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE) \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 0), \
CVIVANTE_NAMESPACE("cl.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \
KERNEL_SOURCE_1 },
#define HASH_CUMSUM_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE) \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1), \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1, 0), \
CVIVANTE_NAMESPACE("cl.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS"_2D"), \
KERNEL_SOURCE_2 },
#define HASH_CUMSUM_ARRAY_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 1), \
CVIVANTE_NAMESPACE("cl.cumsum_array_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \
SOURCE },
#define HASH_CUMSUM_ARRAY_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1, 1), \
CVIVANTE_NAMESPACE("cl.cumsum_array_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS"_2D"), \
SOURCE },
static const struct {
uint32_t key;
char* function_name;
@ -82,6 +97,22 @@ static const struct {
HASH_CUMSUM_KERNELS_2D(1, U8, U8)
HASH_CUMSUM_KERNELS_2D(1, F32, F32)
HASH_CUMSUM_KERNELS_2D(1, F32, U8)
HASH_CUMSUM_ARRAY_KERNELS(0, U8, U8, KERNEL_SOURCE_3)
HASH_CUMSUM_ARRAY_KERNELS(0, F32, F32, KERNEL_SOURCE_3)
HASH_CUMSUM_ARRAY_KERNELS(0, F32, U8, KERNEL_SOURCE_3)
HASH_CUMSUM_ARRAY_KERNELS(1, U8, U8, KERNEL_SOURCE_4)
HASH_CUMSUM_ARRAY_KERNELS(1, F32, F32, KERNEL_SOURCE_4)
HASH_CUMSUM_ARRAY_KERNELS(1, F32, U8, KERNEL_SOURCE_4)
HASH_CUMSUM_ARRAY_KERNELS(2, U8, U8, KERNEL_SOURCE_5)
HASH_CUMSUM_ARRAY_KERNELS(2, F32, F32, KERNEL_SOURCE_5)
HASH_CUMSUM_ARRAY_KERNELS(2, F32, U8, KERNEL_SOURCE_5)
HASH_CUMSUM_ARRAY_KERNELS_2D(0, U8, U8, KERNEL_SOURCE_6)
HASH_CUMSUM_ARRAY_KERNELS_2D(0, F32, F32, KERNEL_SOURCE_6)
HASH_CUMSUM_ARRAY_KERNELS_2D(0, F32, U8, KERNEL_SOURCE_6)
HASH_CUMSUM_ARRAY_KERNELS_2D(1, U8, U8, KERNEL_SOURCE_7)
HASH_CUMSUM_ARRAY_KERNELS_2D(1, F32, F32, KERNEL_SOURCE_7)
HASH_CUMSUM_ARRAY_KERNELS_2D(1, F32, U8, KERNEL_SOURCE_7)
};
/*
@ -197,7 +228,8 @@ static vsi_status _query_kernel
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t axis,
int32_t is_2d
int32_t is_2d,
int32_t is_array
/* Add extra params */
)
{
@ -230,7 +262,7 @@ static vsi_status _query_kernel
output_dtype = F32;
}
key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_2d);
key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_2d, is_array);
for ( i = 0; i < _cnt_of_array(cumsum_map); i ++ )
{
@ -270,6 +302,7 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_t * kernel
)
{
#define VSI_NN_MAX_BLOCK_SIZE GPU_TENSOR_MAX_WIDTH
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_CUMSUM_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
@ -291,6 +324,7 @@ static vsi_nn_kernel_node_t _setup
int32_t height = 0;
int32_t channel = 1;
uint32_t i = 0;
int32_t is_array = 0;
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
@ -326,13 +360,16 @@ static vsi_nn_kernel_node_t _setup
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[0], (vsi_size_t)rs_dim );
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
for (i = 0; i < rs_dim; i++)
{
return NULL;
if (shapes[0][i] > VSI_NN_MAX_BLOCK_SIZE)
{
is_array = 1;
}
}
#undef VSI_NN_MAX_BLOCK_SIZE
status = _query_kernel( kernel, inputs, outputs, axis_new, is_2d );
status = _query_kernel( kernel, inputs, outputs, axis_new, is_2d, is_array);
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
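
For context on the "array" cumsum variants added above: instead of rejecting shapes via the old vsi_nn_kernel_gpu_check_shape early-return, the setup now flags is_array whenever a reshaped dimension exceeds GPU_TENSOR_MAX_WIDTH and selects the cumsum_array_* kernels. A hedged sketch of that decision, assuming kernel/vsi_nn_kernel.h makes vsi_size_t and GPU_TENSOR_MAX_WIDTH visible as the kernel sources above do:

#include <stdint.h>
#include "kernel/vsi_nn_kernel.h"

static int cumsum_needs_array_kernel( const vsi_size_t * shape, uint32_t rank )
{
    uint32_t i;
    for ( i = 0; i < rank; i++ )
    {
        if ( shape[i] > GPU_TENSOR_MAX_WIDTH )
        {
            return 1;  /* too large for image-style access; fall back to the *_array_* kernels */
        }
    }
    return 0;
}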

View File

@ -22,7 +22,7 @@
*
*****************************************************************************/
#if !(VX_TENSOR_GATHER_API_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -36,7 +36,7 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#if !(VX_TENSOR_GATHER_API_SUPPORT)
__BEGIN_DECLS
/*

View File

@ -22,7 +22,7 @@
*
*****************************************************************************/
#if !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -36,7 +36,7 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#if !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
__BEGIN_DECLS
/*

View File

@ -22,7 +22,7 @@
*
*****************************************************************************/
#if !(VX_LOGSOFTMAX_VX_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -36,7 +36,7 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
#if !(VX_LOGSOFTMAX_VX_SUPPORT)
__BEGIN_DECLS

View File

@ -36,6 +36,8 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#if (!VX_NEAREST_GRID_SAMPLE_VX_SUPPORT)
__BEGIN_DECLS
/*
@ -412,3 +414,4 @@ __END_DECLS
REGISTER_BACKEND_CL( nearest_grid_sample, _setup )
#endif

View File

@ -22,7 +22,7 @@
*
*****************************************************************************/
#if !(VX_TENSOR_POW_API_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -36,7 +36,7 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#if !(VX_TENSOR_POW_API_SUPPORT)
__BEGIN_DECLS
/*

View File

@ -36,7 +36,7 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
#if (!VX_RESIZE_BILINEAR_SH_SUPPORT)
__BEGIN_DECLS
#define _RESIZE_BILINEAR_KERNEL_SOURCE() "resize_bilinear"
@ -319,3 +319,4 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CL( resize_bilinear, _setup )
#endif

View File

@ -22,7 +22,7 @@
*
*****************************************************************************/
#if !(VX_TENSOR_TILE_API_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -36,7 +36,7 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
#if !(VX_TENSOR_TILE_API_SUPPORT)
__BEGIN_DECLS

View File

@ -34,20 +34,24 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
#define _TOPK_KERNEL_SOURCE "topk"
#define STR(a) #a
// Add kernel hashtable here
#define TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ) \
( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (STAGES << 16) )
#define TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES, SECTION ) \
( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (STAGES << 16) | (SECTION << 26))
#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, STAGES ) \
{ TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ), \
{ TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES, 0 ), \
CVIVANTE_NAMESPACE("cl.topk_stage"STR(STAGES)"_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \
_TOPK_KERNEL_SOURCE }
#define PACK_MERGE_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
{ TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, 1 ), \
CVIVANTE_NAMESPACE("cl.topk_stage_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \
"topk2" }
#define TOPK_ODD_EVEN_SORT_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) )
#define PACK_ODD_EVEN_SORT_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
@ -79,6 +83,7 @@ static const _kernel_map_type _topk_kernel_map[] =
PACK_KERNEL_MAP( F32, F32, 4 ),
PACK_KERNEL_MAP( F32, F32, 5 ),
PACK_KERNEL_MAP( F32, F32, 6 ),
PACK_KERNEL_MAP( F32, F32, 9 ),
PACK_KERNEL_MAP( U32, U32, 0 ),
PACK_KERNEL_MAP( U32, U32, 1 ),
@ -87,6 +92,7 @@ static const _kernel_map_type _topk_kernel_map[] =
PACK_KERNEL_MAP( U32, U32, 4 ),
PACK_KERNEL_MAP( U32, U32, 5 ),
PACK_KERNEL_MAP( U32, U32, 6 ),
PACK_KERNEL_MAP( U32, U32, 9 ),
PACK_KERNEL_MAP( I32, I32, 0 ),
PACK_KERNEL_MAP( I32, I32, 1 ),
@ -95,6 +101,7 @@ static const _kernel_map_type _topk_kernel_map[] =
PACK_KERNEL_MAP( I32, I32, 4 ),
PACK_KERNEL_MAP( I32, I32, 5 ),
PACK_KERNEL_MAP( I32, I32, 6 ),
PACK_KERNEL_MAP( I32, I32, 9 ),
PACK_KERNEL_MAP( F32, U32, 0 ),
PACK_KERNEL_MAP( F32, U32, 1 ),
@ -103,6 +110,7 @@ static const _kernel_map_type _topk_kernel_map[] =
PACK_KERNEL_MAP( F32, U32, 4 ),
PACK_KERNEL_MAP( F32, U32, 5 ),
PACK_KERNEL_MAP( F32, U32, 6 ),
PACK_KERNEL_MAP( F32, U32, 9 ),
PACK_KERNEL_MAP( F32, I32, 0 ),
PACK_KERNEL_MAP( F32, I32, 1 ),
@ -111,6 +119,10 @@ static const _kernel_map_type _topk_kernel_map[] =
PACK_KERNEL_MAP( F32, I32, 4 ),
PACK_KERNEL_MAP( F32, I32, 5 ),
PACK_KERNEL_MAP( F32, I32, 6 ),
PACK_KERNEL_MAP( F32, I32, 9 ),
PACK_MERGE_KERNEL_MAP(U32, U32),
PACK_MERGE_KERNEL_MAP(I32, I32),
};
static const _kernel_map_type _topk_odd_even_sort_kernel_map[] =
@ -254,7 +266,8 @@ static vsi_status _query_kernel
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t num_stages
int32_t num_stages,
vsi_bool is_bitnoic_segment
)
{
vsi_status status = VSI_FAILURE;
@ -272,21 +285,23 @@ static vsi_status _query_kernel
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
num_stages = is_bitnoic_segment ? 0 : num_stages;
switch (_PACK_SELECT_KEY(in_dtype, out_dtype))
{
case _PACK_SELECT_KEY(F32, F32):
case _PACK_SELECT_KEY(F16, F16):
key = TOPK_HASH_KEY( F32, F32, num_stages );
key = TOPK_HASH_KEY( F32, F32, num_stages, is_bitnoic_segment );
break;
case _PACK_SELECT_KEY(U32, U32):
case _PACK_SELECT_KEY(U16, U16):
case _PACK_SELECT_KEY(U8, U8):
key = TOPK_HASH_KEY( U32, U32, num_stages );
key = TOPK_HASH_KEY( U32, U32, num_stages, is_bitnoic_segment );
break;
case _PACK_SELECT_KEY(I32, I32):
case _PACK_SELECT_KEY(I16, I16):
case _PACK_SELECT_KEY(I8, I8):
key = TOPK_HASH_KEY( I32, I32, num_stages );
key = TOPK_HASH_KEY( I32, I32, num_stages, is_bitnoic_segment );
break;
case _PACK_SELECT_KEY(F32, U32):
case _PACK_SELECT_KEY(F16, U32):
@ -294,7 +309,7 @@ static vsi_status _query_kernel
case _PACK_SELECT_KEY(F16, U16):
case _PACK_SELECT_KEY(F32, U8):
case _PACK_SELECT_KEY(F16, U8):
key = TOPK_HASH_KEY( F32, U32, num_stages );
key = TOPK_HASH_KEY( F32, U32, num_stages, is_bitnoic_segment );
break;
case _PACK_SELECT_KEY(F32, I32):
case _PACK_SELECT_KEY(F16, I32):
@ -302,7 +317,7 @@ static vsi_status _query_kernel
case _PACK_SELECT_KEY(F16, I16):
case _PACK_SELECT_KEY(F32, I8):
case _PACK_SELECT_KEY(F16, I8):
key = TOPK_HASH_KEY( F32, I32, num_stages );
key = TOPK_HASH_KEY( F32, I32, num_stages, is_bitnoic_segment );
break;
default:
break;
@ -440,7 +455,12 @@ static vsi_nn_kernel_node_t _setup
int32_t top_k = vsi_nn_kernel_param_get_int32(params, "top_k");
int32_t num_stages = (int32_t)vsi_nn_max(ceil(log10(block_size / 2.0f) / log10(2.0f)), 0);
vsi_bool is_odd_even_sort = FALSE;
vsi_bool is_bitnoic_segment = FALSE;
size_t param_num = _TOPK_PARAM_NUM;
int32_t max_stages = 7 + (int32_t)log2(graph->ctx->config.subGroupSize >> 2);
vsi_nn_kernel_dtype_e type0 = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
vsi_nn_kernel_dtype_e type1 = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
float inputScale = vsi_nn_get_tensor_scale(inputs[0]);
float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
@ -471,9 +491,22 @@ static vsi_nn_kernel_node_t _setup
rs_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shape[0], 2 );
if (num_stages < 7)
is_bitnoic_segment = (num_stages >= 9) && (top_k <= 512 && max_stages > 9) &&
type0 == type1 && (type0 == U8 || type0 == I8 || type0 == I16 || type0 == U16 || type0 == I32 || type0 == U32);
if (is_bitnoic_segment && num_stages == 9)
{
status = _query_kernel( kernel, inputs, outputs, num_stages );
is_bitnoic_segment = FALSE;
}
else
{
num_stages = is_bitnoic_segment ? 9 : num_stages;
max_stages = is_bitnoic_segment ? max_stages : 7;
}
if (num_stages < max_stages || is_bitnoic_segment)
{
status = _query_kernel( kernel, inputs, outputs, num_stages, is_bitnoic_segment );
rs_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shape[1], 2 );
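
For context on the new topk path: the stage count is derived from the sorting block size, and what the code calls the "bitnoic segment" branch (presumably a bitonic segment merge, matching the added topk2/PACK_MERGE kernels) is only considered for same-typed integer data with top_k <= 512 when the device's stage budget max_stages = 7 + log2(subGroupSize / 4) exceeds 9; when num_stages is exactly 9 the code drops back to the plain staged kernel. A hedged mirror of the stage arithmetic:

#include <math.h>

/* Illustrative mirror of the stage-count math in _setup; subGroupSize comes from the HW config. */
static void topk_stage_example( int block_size, int sub_group_size )
{
    int num_stages = (int)fmax( ceil( log2( block_size / 2.0 ) ), 0.0 );
    int max_stages = 7 + (int)log2( (double)( sub_group_size >> 2 ) );
    /* e.g. block_size = 4096, sub_group_size = 32  ->  num_stages = 11, max_stages = 10 */
    (void)num_stages;
    (void)max_stages;
}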

View File

@ -35,6 +35,8 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#if (!VX_ARGMAX_VX_SUPPORT)
__BEGIN_DECLS
#define HASH_ARGMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \
@ -510,3 +512,4 @@ __END_DECLS
REGISTER_BACKEND_EVIS( argmax, _setup )
#endif

View File

@ -51,26 +51,49 @@ __BEGIN_DECLS
#define KERNEL_SOURCE_5 "cumsum_ex_rev_axis0"
#define KERNEL_SOURCE_6 "cumsum_ex_rev_axis1"
#define KERNEL_SOURCE_7 "cumsum_ex_rev_axis2"
#define KERNEL_SOURCE_8 "cumsum_array"
#define KERNEL_SOURCE_9 "cumsum_array_2d"
#define KERNEL_SOURCE_10 "cumsum_array_bf16"
#define KERNEL_SOURCE_11 "cumsum_array_f16_u8"
#define KERNEL_SOURCE_12 "cumsum_array_ex_rev_axis0"
#define KERNEL_SOURCE_13 "cumsum_array_ex_rev_axis1"
#define KERNEL_SOURCE_14 "cumsum_array_ex_rev_axis2"
#define KERNEL_SOURCE_15 "cumsum_array_f16_u8_2d"
// Add kernel hashtable here
#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, EX_REV, _image_2d) \
((EX_REV << 24) | (AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d))
#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, EX_REV, _image_2d, is_array) \
((EX_REV << 24) | (AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 8) | (_image_2d << 4) | (is_array))
#define HASH_CUMSUM_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 0), \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 0, 0), \
CVIVANTE_NAMESPACE("evis.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \
SOURCE },
#define HASH_CUMSUM_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 1), \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 1, 0), \
CVIVANTE_NAMESPACE("evis.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS"_2D"), \
SOURCE },
#define HASH_CUMSUM_EX_REV_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1, 0), \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1, 0, 0), \
CVIVANTE_NAMESPACE("evis.cumsum_ex_rev_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \
SOURCE },
#define HASH_CUMSUM_ARRAY_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 0, 1), \
CVIVANTE_NAMESPACE("evis.cumsum_array_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \
SOURCE },
#define HASH_CUMSUM_ARRAY_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 1, 1), \
CVIVANTE_NAMESPACE("evis.cumsum_array_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS"_2D"), \
SOURCE },
#define HASH_CUMSUM_ARRAY_EX_REV_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1, 0, 1), \
CVIVANTE_NAMESPACE("evis.cumsum_ex_rev_array_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \
SOURCE },
static const struct {
uint32_t key;
char* function_name;
@ -135,6 +158,65 @@ static const struct {
HASH_CUMSUM_EX_REV_KERNELS(2, F16, U8, KERNEL_SOURCE_4)
HASH_CUMSUM_EX_REV_KERNELS(2, F16, I8, KERNEL_SOURCE_4)
HASH_CUMSUM_EX_REV_KERNELS(2, F16, I16, KERNEL_SOURCE_4)
HASH_CUMSUM_ARRAY_KERNELS(0, U8, U8, KERNEL_SOURCE_8)
HASH_CUMSUM_ARRAY_KERNELS(0, I8, I8, KERNEL_SOURCE_8)
HASH_CUMSUM_ARRAY_KERNELS(0, I16, I16, KERNEL_SOURCE_8)
HASH_CUMSUM_ARRAY_KERNELS(0, F16, F16, KERNEL_SOURCE_8)
HASH_CUMSUM_ARRAY_KERNELS(0, BF16, BF16, KERNEL_SOURCE_10)
HASH_CUMSUM_ARRAY_KERNELS(1, U8, U8, KERNEL_SOURCE_8)
HASH_CUMSUM_ARRAY_KERNELS(1, I8, I8, KERNEL_SOURCE_8)
HASH_CUMSUM_ARRAY_KERNELS(1, I16, I16, KERNEL_SOURCE_8)
HASH_CUMSUM_ARRAY_KERNELS(1, F16, F16, KERNEL_SOURCE_8)
HASH_CUMSUM_ARRAY_KERNELS(1, BF16, BF16, KERNEL_SOURCE_10)
HASH_CUMSUM_ARRAY_KERNELS(2, U8, U8, KERNEL_SOURCE_8)
HASH_CUMSUM_ARRAY_KERNELS(2, I8, I8, KERNEL_SOURCE_8)
HASH_CUMSUM_ARRAY_KERNELS(2, I16, I16, KERNEL_SOURCE_8)
HASH_CUMSUM_ARRAY_KERNELS(2, F16, F16, KERNEL_SOURCE_8)
HASH_CUMSUM_ARRAY_KERNELS(2, BF16, BF16, KERNEL_SOURCE_10)
HASH_CUMSUM_ARRAY_KERNELS_2D(0, U8, U8, KERNEL_SOURCE_9)
HASH_CUMSUM_ARRAY_KERNELS_2D(0, I8, I8, KERNEL_SOURCE_9)
HASH_CUMSUM_ARRAY_KERNELS_2D(0, I16, I16, KERNEL_SOURCE_9)
HASH_CUMSUM_ARRAY_KERNELS_2D(0, F16, F16, KERNEL_SOURCE_9)
HASH_CUMSUM_ARRAY_KERNELS_2D(0, BF16, BF16, KERNEL_SOURCE_10)
HASH_CUMSUM_ARRAY_KERNELS_2D(1, U8, U8, KERNEL_SOURCE_9)
HASH_CUMSUM_ARRAY_KERNELS_2D(1, I8, I8, KERNEL_SOURCE_9)
HASH_CUMSUM_ARRAY_KERNELS_2D(1, I16, I16, KERNEL_SOURCE_9)
HASH_CUMSUM_ARRAY_KERNELS_2D(1, F16, F16, KERNEL_SOURCE_9)
HASH_CUMSUM_ARRAY_KERNELS_2D(1, BF16, BF16, KERNEL_SOURCE_10)
HASH_CUMSUM_ARRAY_KERNELS(0, F16, U8, KERNEL_SOURCE_11)
HASH_CUMSUM_ARRAY_KERNELS(0, F16, I8, KERNEL_SOURCE_11)
HASH_CUMSUM_ARRAY_KERNELS(0, F16, I16, KERNEL_SOURCE_11)
HASH_CUMSUM_ARRAY_KERNELS(1, F16, U8, KERNEL_SOURCE_11)
HASH_CUMSUM_ARRAY_KERNELS(1, F16, I8, KERNEL_SOURCE_11)
HASH_CUMSUM_ARRAY_KERNELS(1, F16, I16, KERNEL_SOURCE_11)
HASH_CUMSUM_ARRAY_KERNELS(2, F16, U8, KERNEL_SOURCE_11)
HASH_CUMSUM_ARRAY_KERNELS(2, F16, I8, KERNEL_SOURCE_11)
HASH_CUMSUM_ARRAY_KERNELS(2, F16, I16, KERNEL_SOURCE_11)
HASH_CUMSUM_ARRAY_KERNELS_2D(0, F16, U8, KERNEL_SOURCE_15)
HASH_CUMSUM_ARRAY_KERNELS_2D(0, F16, I8, KERNEL_SOURCE_15)
HASH_CUMSUM_ARRAY_KERNELS_2D(0, F16, I16, KERNEL_SOURCE_15)
HASH_CUMSUM_ARRAY_KERNELS_2D(1, F16, U8, KERNEL_SOURCE_15)
HASH_CUMSUM_ARRAY_KERNELS_2D(1, F16, I8, KERNEL_SOURCE_15)
HASH_CUMSUM_ARRAY_KERNELS_2D(1, F16, I16, KERNEL_SOURCE_15)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(0, U8, U8, KERNEL_SOURCE_12)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(0, I8, I8, KERNEL_SOURCE_12)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(0, I16, I16, KERNEL_SOURCE_12)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(0, F16, F16, KERNEL_SOURCE_12)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, U8, U8, KERNEL_SOURCE_13)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, I8, I8, KERNEL_SOURCE_13)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, I16, I16, KERNEL_SOURCE_13)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, F16, F16, KERNEL_SOURCE_13)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, U8, U8, KERNEL_SOURCE_14)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, I8, I8, KERNEL_SOURCE_14)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, I16, I16, KERNEL_SOURCE_14)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, F16, F16, KERNEL_SOURCE_14)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, F16, U8, KERNEL_SOURCE_11)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, F16, I8, KERNEL_SOURCE_11)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, F16, I16, KERNEL_SOURCE_11)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, F16, U8, KERNEL_SOURCE_11)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, F16, I8, KERNEL_SOURCE_11)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, F16, I16, KERNEL_SOURCE_11)
};
/*
@ -161,6 +243,7 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer)
size_t param_size
)
{
#define VSI_NN_MAX_BLOCK_SIZE GPU_TENSOR_MAX_WIDTH
vsi_status status = VSI_FAILURE;
gpu_param_t shaderParam = {
3, // workdim
@ -188,6 +271,9 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer)
float in_out_zp_scale = 1.0f;
float in_out_scale = 1.0f;
int32_t is_array = 0;
int32_t remainder = 0;
uint32_t pack_key = 0;
VSI_UNREFERENCED(param_size);
@ -219,7 +305,15 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer)
height = (int32_t)(input_shape->data[1]);
channel = (int32_t)(dim > 2 ? input_shape->data[2] : 1);
if (width > VSI_NN_MAX_BLOCK_SIZE ||
height > VSI_NN_MAX_BLOCK_SIZE ||
channel > VSI_NN_MAX_BLOCK_SIZE)
{
is_array = 1;
}
#undef VSI_NN_MAX_BLOCK_SIZE
if (axis == 0)
{
w = 1;
@ -245,6 +339,7 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer)
{
shaderParam.global_scale[0] = 16;
}
remainder = w % shaderParam.global_scale[0];
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
shaderParam.global_size[0] = (w + shaderParam.global_scale[0] - 1) / shaderParam.global_scale[0];
@ -253,6 +348,12 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer)
status = vsi_nn_kernel_gpu_config( node, &shaderParam );
CHECK_STATUS_FAIL_GOTO(status, OnError);
if (is_array)
{
status = vsi_nn_kernel_gpu_add_param(node, "remainder", &remainder);
status |= vsi_nn_kernel_gpu_add_param(node, "w_size", &w);
CHECK_STATUS_FAIL_GOTO(status, OnError);
}
#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE, AXIS, DIM) \
(IN0_TYPE | (OUT_TYPE << 8) | (AXIS << 16) | (DIM << 24))
@ -767,7 +868,8 @@ static vsi_status _query_kernel
const vsi_nn_kernel_param_t * params,
int32_t axis,
int32_t is_2d,
int32_t is_ex_rev
int32_t is_ex_rev,
int32_t is_array
)
{
vsi_status status = VSI_FAILURE;
@ -781,7 +883,7 @@ static vsi_status _query_kernel
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_ex_rev, is_2d);
key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_ex_rev, is_2d, is_array);
for ( i = 0; i < _cnt_of_array(cumsum_map); i ++ )
{
@ -819,6 +921,7 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_t * kernel
)
{
#define VSI_NN_MAX_BLOCK_SIZE GPU_TENSOR_MAX_WIDTH
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t tmp_params[_CUMSUM_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
@ -831,7 +934,10 @@ static vsi_nn_kernel_node_t _setup
int32_t is_2d = 0;
uint32_t rs_dim = 2;
uint32_t i = 0;
int32_t is_array = 0;
int32_t is_ex_or_rev = exclusive || reverse;
vsi_nn_kernel_dtype_e input0_dtype = U8;
int32_t width = 0;
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
@ -860,7 +966,30 @@ static vsi_nn_kernel_node_t _setup
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[0], (vsi_size_t)rs_dim );
status = _query_kernel( inputs, outputs, kernel, params, axis_new, is_2d, is_ex_or_rev);
width = (int32_t)shapes[0][0];
for (i = 0; i < rs_dim; i++)
{
if (shapes[0][i] > VSI_NN_MAX_BLOCK_SIZE)
{
is_array = 1;
}
}
#undef VSI_NN_MAX_BLOCK_SIZE
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
if (is_array &&
((axis_new == 0 && width < 8) ||
(axis_new > 0 && (((input0_dtype == U8 || input0_dtype == I8) && width < 16) ||
((input0_dtype != U8 && input0_dtype != I8) && width < 8)))
))
{
return NULL;
}
status = _query_kernel( inputs, outputs, kernel, params, axis_new, is_2d, is_ex_or_rev, is_array);
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );

View File

@ -22,7 +22,7 @@
*
*****************************************************************************/
#if !(VX_TENSOR_GATHER_API_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -35,7 +35,7 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#if !(VX_TENSOR_GATHER_API_SUPPORT)
__BEGIN_DECLS
/*

View File

@ -58,14 +58,14 @@ __BEGIN_DECLS
_3D
} vsi_nn_kernel_coord_type_e;
#define HASH_GATHER_ND_KEY(_input0_type, _output_type, _coord_dim, _batch_dim) \
((_input0_type << 24) | (_output_type << 16) | (_coord_dim << 8) | (_batch_dim))
#define HASH_GATHER_ND_KEY(_input0_type, _output_type, _coord_dim, _batch_dim, is_array) \
((_input0_type << 24) | (_output_type << 16) | (_coord_dim << 8) | (_batch_dim << 4) | (is_array))
#define HASH_GATHER_ND_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE, COORD_TYPE) \
CVIVANTE_NAMESPACE("evis.gather_nd_"#SRC0_TYPE"to"#DST_TYPE#COORD_TYPE)
#define TENSOR_GATHER_ND_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE, SOURCE) \
{ HASH_GATHER_ND_KEY(IN0_TYPE, OUT_TYPE, COORD_TYPE, 0), \
{ HASH_GATHER_ND_KEY(IN0_TYPE, OUT_TYPE, COORD_TYPE, 0, 0), \
HASH_GATHER_ND_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE, COORD_TYPE), \
SOURCE },
@ -73,10 +73,26 @@ __BEGIN_DECLS
CVIVANTE_NAMESPACE("evis.gather_nd_batch_"#SRC0_TYPE"to"#DST_TYPE#COORD_TYPE)
#define TENSOR_GATHER_ND_BATCH_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE, SOURCE) \
{ HASH_GATHER_ND_KEY(IN0_TYPE, OUT_TYPE, COORD_TYPE, 1), \
{ HASH_GATHER_ND_KEY(IN0_TYPE, OUT_TYPE, COORD_TYPE, 1, 0), \
HASH_GATHER_ND_BATCH_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE, COORD_TYPE), \
SOURCE },
#define HASH_GATHER_ND_ARRAY_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE, COORD_TYPE) \
CVIVANTE_NAMESPACE("evis.gather_nd_array_"#SRC0_TYPE"to"#DST_TYPE#COORD_TYPE)
#define TENSOR_GATHER_ND_ARRAY_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE, SOURCE) \
{ HASH_GATHER_ND_KEY(IN0_TYPE, OUT_TYPE, COORD_TYPE, 0, 1), \
HASH_GATHER_ND_ARRAY_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE, COORD_TYPE), \
SOURCE },
#define HASH_GATHER_ND_ARRAY_BATCH_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE, COORD_TYPE) \
CVIVANTE_NAMESPACE("evis.gather_nd_array_batch_"#SRC0_TYPE"to"#DST_TYPE#COORD_TYPE)
#define TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE, SOURCE) \
{ HASH_GATHER_ND_KEY(IN0_TYPE, OUT_TYPE, COORD_TYPE, 1, 1), \
HASH_GATHER_ND_ARRAY_BATCH_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE, COORD_TYPE), \
SOURCE },
static const struct {
uint32_t key;
char* function_name;
@ -125,6 +141,50 @@ static const struct {
TENSOR_GATHER_ND_BATCH_KERNELS(U8, I32, U8, _2D, KERNEL_SOURCE_8)
TENSOR_GATHER_ND_BATCH_KERNELS(I16, I32, I16, _2D, KERNEL_SOURCE_8)
TENSOR_GATHER_ND_BATCH_KERNELS(F16, I32, F16, _2D, KERNEL_SOURCE_8)
TENSOR_GATHER_ND_ARRAY_KERNELS(I8, I32, I8, _1D, KERNEL_SOURCE_1)
TENSOR_GATHER_ND_ARRAY_KERNELS(U8, I32, U8, _1D, KERNEL_SOURCE_1)
TENSOR_GATHER_ND_ARRAY_KERNELS(I16, I32, I16, _1D, KERNEL_SOURCE_1)
TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, F16, _1D, KERNEL_SOURCE_1)
TENSOR_GATHER_ND_ARRAY_KERNELS(I8, I32, I8, _2D, KERNEL_SOURCE_2)
TENSOR_GATHER_ND_ARRAY_KERNELS(U8, I32, U8, _2D, KERNEL_SOURCE_2)
TENSOR_GATHER_ND_ARRAY_KERNELS(I16, I32, I16, _2D, KERNEL_SOURCE_2)
TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, F16, _2D, KERNEL_SOURCE_2)
TENSOR_GATHER_ND_ARRAY_KERNELS(I8, I32, I8, _3D, KERNEL_SOURCE_3)
TENSOR_GATHER_ND_ARRAY_KERNELS(U8, I32, U8, _3D, KERNEL_SOURCE_3)
TENSOR_GATHER_ND_ARRAY_KERNELS(I16, I32, I16, _3D, KERNEL_SOURCE_3)
TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, F16, _3D, KERNEL_SOURCE_3)
TENSOR_GATHER_ND_ARRAY_KERNELS(I8, I32, F16, _1D, KERNEL_SOURCE_4)
TENSOR_GATHER_ND_ARRAY_KERNELS(I16, I32, F16, _1D, KERNEL_SOURCE_4)
TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, I8, _1D, KERNEL_SOURCE_4)
TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, I16, _1D, KERNEL_SOURCE_4)
TENSOR_GATHER_ND_ARRAY_KERNELS(U8, I32, F16, _1D, KERNEL_SOURCE_4)
TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, U8, _1D, KERNEL_SOURCE_4)
TENSOR_GATHER_ND_ARRAY_KERNELS(I8, I32, F16, _2D, KERNEL_SOURCE_5)
TENSOR_GATHER_ND_ARRAY_KERNELS(I16, I32, F16, _2D, KERNEL_SOURCE_5)
TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, I8, _2D, KERNEL_SOURCE_5)
TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, I16, _2D, KERNEL_SOURCE_5)
TENSOR_GATHER_ND_ARRAY_KERNELS(U8, I32, F16, _2D, KERNEL_SOURCE_5)
TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, U8, _2D, KERNEL_SOURCE_5)
TENSOR_GATHER_ND_ARRAY_KERNELS(I8, I32, F16, _3D, KERNEL_SOURCE_6)
TENSOR_GATHER_ND_ARRAY_KERNELS(I16, I32, F16, _3D, KERNEL_SOURCE_6)
TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, I8, _3D, KERNEL_SOURCE_6)
TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, I16, _3D, KERNEL_SOURCE_6)
TENSOR_GATHER_ND_ARRAY_KERNELS(U8, I32, F16, _3D, KERNEL_SOURCE_6)
TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, U8, _3D, KERNEL_SOURCE_6)
TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(I8, I32, I8, _1D, KERNEL_SOURCE_7)
TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(U8, I32, U8, _1D, KERNEL_SOURCE_7)
TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(I16, I32, I16, _1D, KERNEL_SOURCE_7)
TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(F16, I32, F16, _1D, KERNEL_SOURCE_7)
TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(I8, I32, I8, _2D, KERNEL_SOURCE_8)
TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(U8, I32, U8, _2D, KERNEL_SOURCE_8)
TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(I16, I32, I16, _2D, KERNEL_SOURCE_8)
TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(F16, I32, F16, _2D, KERNEL_SOURCE_8)
};
/*
@ -148,7 +208,8 @@ static vsi_status get_gather_nd_tensor_reshape_size
vsi_size_t block_size,
uint32_t coordDim,
int32_t* newDim,
uint32_t batch_dims
uint32_t batch_dims,
int32_t* arrayFlg
)
{
vsi_status status = VSI_FAILURE;
@ -184,12 +245,20 @@ static vsi_status get_gather_nd_tensor_reshape_size
for (i = 0; i < coordDim - 1; i++)
{
sizes[rank++] = input_size[i + offset];
if (sizes[i] >= VSI_NN_MAX_IMAGE_WIDTH)
{
arrayFlg[0] = 1;
}
}
for (i = 0; i < batch_dims; i++)
{
sizes[rank] *= input_size[dims_num - i - 1];
}
if (sizes[rank] >= VSI_NN_MAX_IMAGE_WIDTH)
{
arrayFlg[0] = 1;
}
newDim[0] = rank + 1;
}
@ -198,6 +267,10 @@ static vsi_status get_gather_nd_tensor_reshape_size
for (i = coordDim-1; i > 0; i--)
{
sizes[i] = input_size[i + offset - 1];
if (sizes[i] >= VSI_NN_MAX_IMAGE_WIDTH)
{
arrayFlg[0] = 1;
}
}
for (i = 0; i < offset; i++)
{
@ -210,6 +283,10 @@ static vsi_status get_gather_nd_tensor_reshape_size
newDim[0] = 2;
sizes[0] = block_size;
sizes[1] = elementCnt / block_size;
if ((elementCnt / block_size) >= VSI_NN_MAX_IMAGE_WIDTH)
{
arrayFlg[0] = 1;
}
}
else if (coordDim == 4)
{
@ -242,6 +319,14 @@ static vsi_status get_gather_nd_tensor_reshape_size
status = VSI_SUCCESS;
newDim[0] = 3;
}
else
{
sizes[0] = block_size;
sizes[1] = elementCnt / block_size;
status = VSI_SUCCESS;
newDim[0] = 2;
arrayFlg[0] = 1;
}
}
#undef VSI_NN_MAX_IMAGE_WIDTH
@ -409,7 +494,8 @@ static vsi_status _query_kernel
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel,
int32_t coord_dim,
int32_t batch_dims
int32_t batch_dims,
int32_t is_array
)
{
vsi_status status = VSI_FAILURE;
@ -444,7 +530,7 @@ static vsi_status _query_kernel
coord_type = _3D;
}
key = HASH_GATHER_ND_KEY( input0_dtype, output_dtype, coord_type, batch_flg );
key = HASH_GATHER_ND_KEY( input0_dtype, output_dtype, coord_type, batch_flg, is_array);
for ( i = 0; i < _cnt_of_array(gather_nd_map); i ++ )
{
@ -482,6 +568,7 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_t * kernel
)
{
#define VSI_NN_MAX_BLOCK_SIZE GPU_TENSOR_MAX_WIDTH
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t tmp_params[_GATHER_ND_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
@ -489,26 +576,41 @@ static vsi_nn_kernel_node_t _setup
int32_t batch_dims = vsi_nn_kernel_param_get_int32( params, "batch_dims" );
int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" );
int32_t input_size = 1;
int32_t no_block_batch_size = 1;
int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0;
int32_t is_array = 0;
int32_t i = 0;
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
status = get_gather_nd_tensor_reshape_size(&inputs[0], shapes[0], block_size, coord_dim, &rs_in_dim, batch_dims);
status |= get_gather_nd_tensor_reshape_size(&inputs[1], shapes[1], coord_dim, 0, &rs_idx_dim, batch_dims);
status |= get_gather_nd_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0, &rs_out_dim, batch_dims);
for (i = 0; i < (int32_t)inputs[0]->attr.dim_num; i++)
{
input_size = input_size * (int32_t)inputs[0]->attr.size[i];
}
no_block_batch_size = input_size / block_size;
is_array = no_block_batch_size > VSI_NN_MAX_BLOCK_SIZE ? 1 : 0;
status = get_gather_nd_tensor_reshape_size(&inputs[0], shapes[0],
block_size, coord_dim, &rs_in_dim, batch_dims, &is_array);
status |= get_gather_nd_tensor_reshape_size(&inputs[1], shapes[1],
coord_dim, 0, &rs_idx_dim, batch_dims, &is_array);
status |= get_gather_nd_tensor_reshape_size(&outputs[0], shapes[2],
block_size, 0, &rs_out_dim, batch_dims, &is_array);
#undef VSI_NN_MAX_BLOCK_SIZE
if (status != VSI_SUCCESS)
{
return NULL;
}
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
//if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
// outputs[0]->attr.dim_num ) )
//{
// return NULL;
//}
status = _query_kernel( inputs, outputs, kernel, coord_dim, batch_dims );
status = _query_kernel( inputs, outputs, kernel, coord_dim, batch_dims, is_array);
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
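For reference, a minimal host-side sketch (illustrative only, not part of this diff) of the width check that drives the new _array_ kernel selection above. MAX_IMAGE_WIDTH stands in for GPU_TENSOR_MAX_WIDTH and is assumed to be 65536 here; use_array_kernel is a made-up helper name.

/* Illustrative sketch: mirrors how the evis gather_nd setup decides between
 * the image-based and the "_array_" shader variants. */
#include <stdint.h>
#include <stdio.h>

#define MAX_IMAGE_WIDTH 65536  /* assumed value of GPU_TENSOR_MAX_WIDTH */

static int use_array_kernel(const int64_t *dims, int rank, int64_t block_size)
{
    int64_t element_cnt = 1;
    int i;
    for (i = 0; i < rank; i++)
    {
        element_cnt *= dims[i];
    }
    /* Once the flattened (element_cnt / block_size) axis no longer fits an
     * image width, the reshape helper raises arrayFlg and _query_kernel
     * picks the gather_nd_array_* kernels instead. */
    return (element_cnt / block_size) >= MAX_IMAGE_WIDTH ? 1 : 0;
}

int main(void)
{
    int64_t dims[3] = { 128, 1024, 1024 };
    printf("is_array = %d\n", use_array_kernel(dims, 3, 128));
    return 0;
}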

View File

@ -22,7 +22,7 @@
*
*****************************************************************************/
#if !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -36,7 +36,7 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
#if !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
__BEGIN_DECLS
#define SOURCE_AXIS0_0 "layer_normalization_0"

View File

@ -22,7 +22,7 @@
*
*****************************************************************************/
#if !(VX_LOGSOFTMAX_VX_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -36,7 +36,7 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
#if !(VX_LOGSOFTMAX_VX_SUPPORT)
__BEGIN_DECLS
#define HASH_LOG_SOFTMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \

View File

@ -36,6 +36,8 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#if (!VX_NEAREST_GRID_SAMPLE_VX_SUPPORT)
__BEGIN_DECLS
/*
@ -625,3 +627,4 @@ __END_DECLS
REGISTER_BACKEND_EVIS( nearest_grid_sample, _setup )
#endif

View File

@ -22,7 +22,7 @@
*
*****************************************************************************/
#if !(VX_TENSOR_POW_API_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -36,7 +36,7 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#if !(VX_TENSOR_POW_API_SUPPORT)
__BEGIN_DECLS
#define KERNEL_SOURCE "pow",

View File

@ -750,6 +750,7 @@ static vsi_nn_kernel_node_t _setup
shape[2] = 1;
reshape_tensor = vsi_nn_reshape_tensor( graph,
outputs[0], shape, outputs[0]->attr.dim_num );
CHECK_PTR_FAIL_GOTO(reshape_tensor, "Create tensor fail.", final);
if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensor->attr.size,
outputs[0]->attr.dim_num ) )
@ -819,6 +820,7 @@ static vsi_nn_kernel_node_t _setup
final:
vsi_nn_safe_free(node_params);
vsi_safe_release_tensor(reshape_tensor);
return node;
} /* _setup() */

View File

@ -911,6 +911,7 @@ static vsi_nn_kernel_node_t _setup
shape[2] = 1;
reshape_tensor = vsi_nn_reshape_tensor( graph,
outputs[0], shape, outputs[0]->attr.dim_num );
CHECK_PTR_FAIL_GOTO(reshape_tensor, "Create tensor fail.", final);
if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensor->attr.size,
outputs[0]->attr.dim_num ) )
@ -978,6 +979,7 @@ static vsi_nn_kernel_node_t _setup
final:
vsi_nn_safe_free(node_params);
vsi_safe_release_tensor(reshape_tensor);
return node;
} /* _setup() */

View File

@ -36,7 +36,7 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_dtype_util_prv.h"
#if (!VX_RESIZE_BILINEAR_SH_SUPPORT)
__BEGIN_DECLS
/*
@ -1515,3 +1515,4 @@ final:
__END_DECLS
REGISTER_BACKEND_EVIS( resize_bilinear, _setup )
#endif

View File

@ -22,7 +22,7 @@
*
*****************************************************************************/
#if !(VX_TENSOR_TILE_API_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -36,7 +36,7 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
#if !(VX_TENSOR_TILE_API_SUPPORT)
__BEGIN_DECLS
/*

View File

@ -29,6 +29,7 @@
#include "vsi_nn_context.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
@ -1673,7 +1674,7 @@ vsi_status vsi_nn_KernelGpuConfig
static vsi_bool _check_shader_support(vsi_nn_graph_t* graph)
{
int32_t enableShader = graph->ctx->options.enable_shader;
int32_t enableShader = ((vsi_nn_graph_prv_t*)graph)->options->enable_shader;
#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
if ( graph->ctx->config.subGroupSize == 0 )

View File

@ -181,6 +181,9 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(cos)
#if (VX_LOGSOFTMAX_VX_SUPPORT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(log_softmax)
#endif
#if (VX_BITCAST_VX_SUPPORT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(bitcast)
#endif
__END_DECLS

View File

@ -916,11 +916,21 @@ vsi_nn_tensor_t * vsi_nn_kernel_insert_reshape_node
{
input = in_tensor;
output = tensor;
/* Create an OpenVX tensor if it does not exist */
if (NULL == input->t)
{
vsi_nn_TensorReinit(graph, input);
}
}
else
{
input = tensor;
output = in_tensor;
/* Create an OpenVX tensor if it does not exist */
if (NULL == output->t)
{
vsi_nn_TensorReinit(graph, output);
}
}
vxTensorReshapeNode(graph->g, input->t, &reshape_param, sizeof(reshape_param), output->t);

View File

@ -0,0 +1,79 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#if (VX_ARGMAX_VX_SUPPORT)
#define REGISTER_ARGMAXOPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
); \
REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
)
REGISTER_ARGMAXOPENVX_KERNEL( argmax )
{
vx_node node = NULL;
int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis");
VSI_UNREFERENCED(kernel);
VSI_UNREFERENCED(params);
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
node = vxArgmaxLayer(graph->g,
inputs[0]->t,
axis,
outputs[0]->t
);
return (vsi_nn_kernel_node_t)node;
} /* argmax() */
#undef REGISTER_ARGMAXOPENVX_KERNEL
#endif

View File

@ -0,0 +1,77 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#if (VX_BITCAST_VX_SUPPORT)
#define REGISTER_BITCASTOPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
); \
REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
)
REGISTER_BITCASTOPENVX_KERNEL( bitcast )
{
vx_node node = NULL;
VSI_UNREFERENCED(kernel);
VSI_UNREFERENCED(params);
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
node = vxBitCastLayer(graph->g,
inputs[0]->t,
outputs[0]->t
);
return (vsi_nn_kernel_node_t)node;
} /* bitcast() */
#undef REGISTER_BITCASTOPENVX_KERNEL
#endif

View File

@ -0,0 +1,91 @@
/****************************************************************************
*
* Copyright (c) 2021 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#if (VX_NEAREST_GRID_SAMPLE_VX_SUPPORT)
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vx_node node = NULL;
int32_t mode =
vsi_nn_kernel_param_get_int32(params, "mode");
int32_t align_corners =
vsi_nn_kernel_param_get_int32(params, "align_corners");
int32_t pad_mode =
vsi_nn_kernel_param_get_int32(params, "padding_mode");
VSI_UNREFERENCED(kernel);
VSI_UNREFERENCED(output_num);
VSI_UNREFERENCED(input_num);
node = vxGridSampleLayer(
graph->g,
inputs[0]->t,
inputs[1]->t,
mode,
align_corners,
pad_mode,
outputs[0]->t
);
return (vsi_nn_kernel_node_t)node;
} /* _setup() */
#define REGISTER_NEAREST_GRID_SAMPLE_OPENVX_KERNEL(KERNEL_NAME) \
static vsi_nn_kernel_node_t _##KERNEL_NAME##_setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num, \
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
) \
{ \
return _setup(graph, inputs, input_num, outputs, output_num, \
params, kernel); \
} \
REGISTER_BACKEND_OPENVX( KERNEL_NAME, _##KERNEL_NAME##_setup )
REGISTER_NEAREST_GRID_SAMPLE_OPENVX_KERNEL( nearest_grid_sample )
#undef REGISTER_NEAREST_GRID_SAMPLE_OPENVX_KERNEL
#endif

View File

@ -0,0 +1,82 @@
/****************************************************************************
*
* Copyright (c) 2021 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#if (VX_L1_LAYER_NORM_VX_SUPPORT)
#define REGISTER_L1_LAYER_NORM_OPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
); \
REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
)
REGISTER_L1_LAYER_NORM_OPENVX_KERNEL( l1_layer_norm )
{
vx_node node = NULL;
float eps = vsi_nn_kernel_param_get_float32( params, "eps" );
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
VSI_UNREFERENCED(kernel);
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
node = vxL1LayerNormalizationLayer(
graph->g,
eps,
axis,
inputs[0]->t,
inputs[1]->t,
inputs[2]->t,
inputs[3]->t,
outputs[0]->t
);
return (vsi_nn_kernel_node_t)node;
} /* l1_layer_norm() */
#undef REGISTER_L1_LAYER_NORM_OPENVX_KERNEL
#endif

View File

@ -0,0 +1,162 @@
#pragma OPENCL EXTENSION cl_viv_vx_extension : enable
#include "cl_viv_vx_ext.h"
_viv_uniform int width_pad;
_viv_uniform int height_pad;
_viv_uniform int depth_pad;
_viv_uniform int move_time_x;
_viv_uniform int move_time_y;
_viv_uniform int kernel_x_new;
_viv_uniform int kernel_y_new;
_viv_uniform int kernel_z_new;
_viv_uniform int depth;
#define COL2IM(name, read_type, dst_type, convert_type, write_type) \
__kernel void col2im_##name \
( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int stride_w, \
int stride_h, \
int stride_d, \
int dilation_w, \
int dilation_h, \
int dilation_d, \
int pad_w_front, \
int pad_w_end, \
int pad_h_front, \
int pad_h_end, \
int pad_d_front, \
int pad_d_end, \
int kernel_x, \
int kernel_y, \
int kernel_z, \
float inOutScale, \
float inOutTile \
) \
{ \
int x = get_global_id(0); \
int y = get_global_id(1); \
int z = get_global_id(2); \
int4 coord_out = (int4)(x,y,z,0); \
int b = z / depth; \
z = z % depth; \
int4 coord_in = (int4)(0,0,b,0); \
\
float sum = 0.0f; \
x = x + pad_w_front; \
y = y + pad_h_front; \
z = z + pad_d_front; \
int offset_x = x % stride_w; \
int offset_y = y % stride_h; \
int offset_z = z % stride_d; \
int i,j,k; \
for (k = offset_z; k < kernel_z_new; k += stride_d) \
{ \
if ((z - k) < 0 || (z + (kernel_z_new - k)) > depth_pad || k % dilation_d != 0) \
{ \
continue; \
} \
for (j = offset_y; j < kernel_y_new; j = j + stride_h) \
{ \
if ((y - j) < 0 || (y + (kernel_y_new - j)) > height_pad || j % dilation_h != 0) \
{ \
continue; \
} \
for (i = offset_x; i < kernel_x_new; i = i + stride_w) \
{ \
if ((x - i) < 0 || (x + (kernel_x_new - i)) > width_pad || i % dilation_w != 0) \
{ \
continue; \
} \
coord_in.x = (x - i + stride_w - 1) / stride_w + \
(y - j + stride_h - 1) / stride_h * move_time_x + \
(z - k + stride_d - 1) / stride_d * move_time_y * move_time_x; \
coord_in.y = i / dilation_w + j * kernel_x / dilation_h + k * kernel_x * kernel_y / dilation_d; \
sum = sum + convert_float(read_type(input, coord_in).x); \
} \
} \
} \
sum = sum * inOutScale + inOutTile; \
dst_type dst = 0; \
dst.x = convert_type(sum); \
write_type(output, coord_out, dst); \
}
COL2IM(U32toU32, read_imageui, uint4, convert_uint, write_imageui)
COL2IM(U32toI32, read_imageui, int4, convert_int, write_imagei)
COL2IM(U32toF32, read_imageui, float4, convert_float, write_imagef)
COL2IM(I32toU32, read_imagei, uint4, convert_uint, write_imageui)
COL2IM(I32toI32, read_imagei, int4, convert_int, write_imagei)
COL2IM(I32toF32, read_imagei, float4, convert_float, write_imagef)
COL2IM(F32toU32, read_imagef, uint4, convert_uint, write_imageui)
COL2IM(F32toI32, read_imagef, int4, convert_int, write_imagei)
COL2IM(F32toF32, read_imagef, float4, convert_float, write_imagef)
#define COL2IM_2D(name, read_type, dst_type, convert_type, write_type) \
__kernel void col2im_##name##_2D \
( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int stride_w, \
int stride_h, \
int stride_d, \
int dilation_w, \
int dilation_h, \
int dilation_d, \
int pad_w_front, \
int pad_w_end, \
int pad_h_front, \
int pad_h_end, \
int pad_d_front, \
int pad_d_end, \
int kernel_x, \
int kernel_y, \
int kernel_z, \
float inOutScale, \
float inOutTile \
) \
{ \
int x = get_global_id(0); \
int y = get_global_id(1); \
int z = get_global_id(2); \
int4 coord_out = (int4)(x,y,z,0); \
int4 coord_in = (int4)(0,0,z,0); \
\
float sum = 0.0f; \
x = x + pad_w_front; \
y = y + pad_h_front; \
int offset_x = x % stride_w; \
int offset_y = y % stride_h; \
int i,j; \
for (j = offset_y; j < kernel_y_new; j = j + stride_h) \
{ \
if ((y - j) < 0 || (y + (kernel_y_new - j)) > height_pad || j % dilation_h != 0) \
{ \
continue; \
} \
for (i = offset_x; i < kernel_x_new; i = i + stride_w) \
{ \
if ((x - i) < 0 || (x + (kernel_x_new - i)) > width_pad || i % dilation_w != 0) \
{ \
continue; \
} \
coord_in.x = (x - i + stride_w - 1) / stride_w + \
(y - j + stride_h - 1) / stride_h * move_time_x; \
coord_in.y = i / dilation_w + j * kernel_x / dilation_h; \
sum = sum + convert_float(read_type(input, coord_in).x); \
} \
} \
sum = sum * inOutScale + inOutTile; \
dst_type dst = 0; \
dst.x = convert_type(sum); \
write_type(output, coord_out, dst); \
}
COL2IM_2D(U32toU32, read_imageui, uint4, convert_uint, write_imageui)
COL2IM_2D(U32toI32, read_imageui, int4, convert_int, write_imagei)
COL2IM_2D(U32toF32, read_imageui, float4, convert_float, write_imagef)
COL2IM_2D(I32toU32, read_imagei, uint4, convert_uint, write_imageui)
COL2IM_2D(I32toI32, read_imagei, int4, convert_int, write_imagei)
COL2IM_2D(I32toF32, read_imagei, float4, convert_float, write_imagef)
COL2IM_2D(F32toU32, read_imagef, uint4, convert_uint, write_imageui)
COL2IM_2D(F32toI32, read_imagef, int4, convert_int, write_imagei)
COL2IM_2D(F32toF32, read_imagef, float4, convert_float, write_imagef)
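As a reading aid, a small standalone sketch (illustrative only, not part of this diff) of the input location that one contributing kernel offset maps to in the col2im kernels above. The move_time values count sliding-window positions per axis, and every number below is a made-up example parameter.

/* Illustrative sketch: the column/row read for one (i, j, k) kernel offset. */
#include <stdio.h>

int main(void)
{
    int stride_w = 2, stride_h = 2, stride_d = 1;
    int dilation_w = 1, dilation_h = 1, dilation_d = 1;
    int kernel_x = 3, kernel_y = 3;
    int move_time_x = 4, move_time_y = 4;      /* windows along width / height */
    int x = 5, y = 5, z = 0;                   /* padded output coordinate     */
    int i = 1, j = 1, k = 0;                   /* kernel offset being summed   */

    /* Same index arithmetic as coord_in.x / coord_in.y in the kernels above. */
    int col = (x - i + stride_w - 1) / stride_w
            + (y - j + stride_h - 1) / stride_h * move_time_x
            + (z - k + stride_d - 1) / stride_d * move_time_y * move_time_x;
    int row = i / dilation_w
            + j * kernel_x / dilation_h
            + k * kernel_x * kernel_y / dilation_d;

    printf("input column = %d, input row = %d\n", col, row);
    return 0;
}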

View File

@ -0,0 +1,332 @@
__kernel void cumsum_array_F32toF32_axis0_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int chn,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
float sum = (float)(0);
Image img1 = create_image_from_image2d(input, 4);
Image img2 = create_image_from_image2d(output, 4);
uchar* input_ptr = get_image_ptr_from_coord(img1, coord);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord);
__global float* in_ptr = (__global float*)input_ptr;
__global float* out_ptr = (__global float*)output_ptr;
if(exclusive && rev)
{
coord.x = width - 1;
coord.z = coord.x;
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
for(; coord.x > 0; coord.x--)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
coord.z--;
sum += data;
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
else if(exclusive)
{
coord.z = 0;
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
for(coord.x = 0; coord.x < width - 1; coord.x++)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
coord.z++;
sum += data;
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
else if(rev)
{
for(coord.x = width - 1; coord.x >= 0; coord.x--)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
sum += data;
output_ptr = get_image_ptr_from_coord(img2, coord.xy);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
else
{
for(coord.x = 0; coord.x < width; coord.x++)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
sum += data;
output_ptr = get_image_ptr_from_coord(img2, coord.xy);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
}
__kernel void cumsum_array_U8toU8_axis0_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int chn,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
uint sum = (uint)(0);
uint dst = (uint)(0);
int tmp_zp = convert_int_rte(output_zp);
dst = convert_uint_sat(tmp_zp);
float cnt = 0.0f;
Image img1 = create_image_from_image2d(input, 4);
Image img2 = create_image_from_image2d(output, 4);
uchar* input_ptr = get_image_ptr_from_coord(img1, coord);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord);
__global uint* in_ptr = (__global uint*)input_ptr;
__global uint* out_ptr = (__global uint*)output_ptr;
if(exclusive && rev)
{
coord.x = width - 1;
coord.z = coord.x;
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
for(; coord.x > 0; coord.x--)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global uint*)input_ptr;
uint data = in_ptr[0];
coord.z--;
cnt += 1.0;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum * in_out_scale + tmpAlpha;
dst = (uint)convert_int_rte(tmpSum);
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
}
}
else if(exclusive)
{
coord.z = 0;
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
for(coord.x = 0; coord.x < width - 1; coord.x++)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global uint*)input_ptr;
uint data = in_ptr[0];
cnt += 1.0f;
coord.z++;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum * in_out_scale + tmpAlpha;
dst = (uint)convert_int_rte(tmpSum);
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
}
}
else if(rev)
{
for(coord.x = width - 1; coord.x >= 0; coord.x--)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global uint*)input_ptr;
uint data = in_ptr[0];
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum * in_out_scale + tmpAlpha;
dst = (uint)convert_int_rte(tmpSum);
output_ptr = get_image_ptr_from_coord(img2, coord.xy);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
}
}
else
{
for(coord.x = 0; coord.x < width; coord.x++)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global uint*)input_ptr;
uint data = in_ptr[0];
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum * in_out_scale + tmpAlpha;
dst = (uint)convert_int_rte(tmpSum);
output_ptr = get_image_ptr_from_coord(img2, coord.xy);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
}
}
}
__kernel void cumsum_array_F32toU8_axis0_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int chn,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
float sum = 0.0f;
uint dst = (uint)(0);
int tmp_zp = convert_int_rte(output_zp);
dst = convert_uint_sat(tmp_zp);
float cnt = 0.0f;
Image img1 = create_image_from_image2d(input, 4);
Image img2 = create_image_from_image2d(output, 4);
uchar* input_ptr = get_image_ptr_from_coord(img1, coord);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord);
__global float* in_ptr = (__global float*)input_ptr;
__global uint* out_ptr = (__global uint*)output_ptr;
if(exclusive && rev)
{
coord.x = width - 1;
coord.z = coord.x;
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
for(; coord.x > 0; coord.x--)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
coord.z--;
cnt += 1.0;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum * in_out_scale + tmpAlpha;
dst = (uint)convert_int_rte(tmpSum);
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
}
}
else if(exclusive)
{
coord.z = 0;
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
for(coord.x = 0; coord.x < width - 1; coord.x++)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
cnt += 1.0f;
coord.z++;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum * in_out_scale + tmpAlpha;
dst = (uint)convert_int_rte(tmpSum);
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
}
}
else if(rev)
{
for(coord.x = width - 1; coord.x >= 0; coord.x--)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum * in_out_scale + tmpAlpha;
dst = (uint)convert_int_rte(tmpSum);
output_ptr = get_image_ptr_from_coord(img2, coord.xy);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
}
}
else
{
for(coord.x = 0; coord.x < width; coord.x++)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum * in_out_scale + tmpAlpha;
dst = (uint)convert_int_rte(tmpSum);
output_ptr = get_image_ptr_from_coord(img2, coord.xy);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
}
}
}
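For orientation, a scalar sketch (illustrative only, not part of this diff) of the requantization step the U8 cumsum kernels above apply after accumulating cnt raw input values. The scale and zero-point numbers are made-up examples, and lrintf stands in for convert_int_rte.

/* Illustrative sketch: tmpSum = sum * in_out_scale + cnt * in_out_zp_scale + output_zp */
#include <math.h>
#include <stdio.h>

int main(void)
{
    float in_out_scale = 0.5f;      /* assumed input_scale / output_scale ratio */
    float in_out_zp_scale = -32.0f; /* assumed per-element zero-point term      */
    float output_zp = 128.0f;
    unsigned int sum = 300u;        /* running sum of raw U8 inputs  */
    float cnt = 3.0f;               /* number of accumulated elements */

    float tmpAlpha = cnt * in_out_zp_scale + output_zp;
    float tmpSum = (float)sum * in_out_scale + tmpAlpha;
    unsigned int dst = (unsigned int)lrintf(tmpSum); /* round-to-nearest analogue */

    printf("quantized cumsum output = %u\n", dst);   /* 182 for these numbers */
    return 0;
}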

View File

@ -0,0 +1,321 @@
__kernel void cumsum_array_F32toF32_axis1_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int chn,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
float sum = (float)(0);
Image img1 = create_image_from_image2d(input, 4);
Image img2 = create_image_from_image2d(output, 4);
uchar* input_ptr = get_image_ptr_from_coord(img1, coord);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord);
__global float* in_ptr = (__global float*)input_ptr;
__global float* out_ptr = (__global float*)output_ptr;
if(exclusive && rev)
{
coord.w = height - 1;
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
for(coord.y = height - 1; coord.y > 0; coord.y--)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
coord.w--;
sum += data;
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
else if(exclusive)
{
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
for(coord.y = 0; coord.y < height - 1; coord.y++)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
coord.w++;
sum += data;
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
else if(rev)
{
for(coord.y = height - 1; coord.y >= 0; coord.y--)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
sum += data;
output_ptr = get_image_ptr_from_coord(img2, coord.xy);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
else
{
for(coord.y = 0; coord.y < height; coord.y++)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
sum += data;
output_ptr = get_image_ptr_from_coord(img2, coord.xy);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
}
__kernel void cumsum_array_U8toU8_axis1_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int chn,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
uint sum = (uint)(0);
uint dst = (uint)(0);
int tmp_zp = convert_int_rte(output_zp);
dst = convert_uint_sat(tmp_zp);
float cnt = 0;
Image img1 = create_image_from_image2d(input, 4);
Image img2 = create_image_from_image2d(output, 4);
uchar* input_ptr = get_image_ptr_from_coord(img1, coord);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord);
__global uint* in_ptr = (__global uint*)input_ptr;
__global uint* out_ptr = (__global uint*)output_ptr;
if(exclusive && rev)
{
coord.w = height - 1;
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
for(coord.y = height - 1; coord.y > 0; coord.y--)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global uint*)input_ptr;
uint data = in_ptr[0];
cnt += 1.0f;
coord.w--;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum * in_out_scale + tmpAlpha;
dst = (uint)convert_int_rte(tmpSum);
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
}
}
else if(exclusive)
{
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
for(coord.y = 0; coord.y < height - 1; coord.y++)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global uint*)input_ptr;
uint data = in_ptr[0];
cnt += 1.0f;
coord.w++;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum * in_out_scale + tmpAlpha;
dst = (uint)convert_int_rte(tmpSum);
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
}
}
else if(rev)
{
for(coord.y = height - 1; coord.y >= 0; coord.y--)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global uint*)input_ptr;
uint data = in_ptr[0];
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum * in_out_scale + tmpAlpha;
dst = (uint)convert_int_rte(tmpSum);
output_ptr = get_image_ptr_from_coord(img2, coord.xy);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
}
}
else
{
for(coord.y = 0; coord.y < height; coord.y++)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global uint*)input_ptr;
uint data = in_ptr[0];
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum * in_out_scale + tmpAlpha;
dst = (uint)convert_int_rte(tmpSum);
output_ptr = get_image_ptr_from_coord(img2, coord.xy);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
}
}
}
__kernel void cumsum_array_F32toU8_axis1_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int chn,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
float sum = (float)(0);
uint dst = (uint)(0);
int tmp_zp = convert_int_rte(output_zp);
dst = convert_uint_sat(tmp_zp);
float cnt = 0;
Image img1 = create_image_from_image2d(input, 4);
Image img2 = create_image_from_image2d(output, 4);
uchar* input_ptr = get_image_ptr_from_coord(img1, coord);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord);
__global float* in_ptr = (__global float*)input_ptr;
__global uint* out_ptr = (__global uint*)output_ptr;
if(exclusive && rev)
{
coord.w = height - 1;
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
for(coord.y = height - 1; coord.y > 0; coord.y--)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
cnt += 1.0f;
coord.w--;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum * in_out_scale + tmpAlpha;
dst = (uint)convert_int_rte(tmpSum);
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
}
}
else if(exclusive)
{
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
for(coord.y = 0; coord.y < height - 1; coord.y++)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
cnt += 1.0f;
coord.w++;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum * in_out_scale + tmpAlpha;
dst = (uint)convert_int_rte(tmpSum);
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
}
}
else if(rev)
{
for(coord.y = height - 1; coord.y >= 0; coord.y--)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum * in_out_scale + tmpAlpha;
dst = (uint)convert_int_rte(tmpSum);
output_ptr = get_image_ptr_from_coord(img2, coord.xy);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
}
}
else
{
for(coord.y = 0; coord.y < height; coord.y++)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum * in_out_scale + tmpAlpha;
dst = (uint)convert_int_rte(tmpSum);
output_ptr = get_image_ptr_from_coord(img2, coord.xy);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
}
}
}

View File

@ -0,0 +1,215 @@
__kernel void cumsum_array_F32toF32_axis0(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int channel,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_out = coord;
float sum = (float)(0);
Tensor img1 = create_tensor_from_image2d_array(input, 4);
Tensor img2 = create_tensor_from_image2d_array(output, 4);
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global float* in_ptr = (__global float*)input_ptr;
__global float* out_ptr = (__global float*)output_ptr;
if(exclusive && rev)
{
coord_out.x = width - 1;
output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
for(coord.x = width - 1; coord.x > 0; coord.x--)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
coord_out.x--;
sum += data;
output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
else if(exclusive)
{
coord_out.x = 0;
output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
for(coord.x = 0; coord.x < width - 1; coord.x++)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
coord_out.x++;
sum += data;
output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
else if(rev)
{
for(coord.x = width - 1; coord.x >= 0; coord.x--)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
sum += data;
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
else
{
for(coord.x = 0; coord.x < width; coord.x++)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
sum += data;
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
}
#define CUMSUM_ARRAY_toU8_AXIS0_SH(name, src_type) \
__kernel void cumsum_array_##name##toU8_axis0( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, \
int exclusive, \
int rev, \
int width, \
int height, \
int channel, \
int input_zp, \
float in_out_scale, \
float in_out_zp_scale, \
float output_zp \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
int4 coord_out = coord; \
\
src_type sum = (src_type)(0); \
uint dst = (uint)(0); \
int tmp_zp = convert_int_rte(output_zp); \
dst = convert_uint_sat(tmp_zp); \
\
float cnt = 0; \
\
Tensor img1 = create_tensor_from_image2d_array(input, 4); \
Tensor img2 = create_tensor_from_image2d_array(output, 4); \
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
__global src_type* in_ptr = (__global src_type*)input_ptr; \
__global uint* out_ptr = (__global uint*)output_ptr; \
if(exclusive && rev) \
{ \
coord_out.x = width - 1; \
output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
for(coord.x = width - 1; coord.x > 0; coord.x--) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src_type data = in_ptr[0]; \
coord_out.x--; \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum * in_out_scale + tmpAlpha; \
\
dst = (uint)convert_int_rte(tmpSum); \
output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
} \
} \
else if(exclusive) \
{ \
coord_out.x = 0; \
output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
for(coord.x = 0; coord.x < width - 1; coord.x++) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src_type data = in_ptr[0]; \
coord_out.x++; \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum * in_out_scale + tmpAlpha; \
\
dst = (uint)convert_int_rte(tmpSum); \
output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
} \
} \
else if(rev) \
{ \
for(coord.x = width - 1; coord.x >= 0; coord.x--) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src_type data = in_ptr[0]; \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum * in_out_scale + tmpAlpha; \
\
dst = (uint)convert_int_rte(tmpSum); \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
} \
} \
else \
{ \
for(coord.x = 0; coord.x < width; coord.x++) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src_type data = in_ptr[0]; \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum * in_out_scale + tmpAlpha; \
\
dst = (uint)convert_int_rte(tmpSum); \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
} \
} \
}
CUMSUM_ARRAY_toU8_AXIS0_SH(U8,uint)
CUMSUM_ARRAY_toU8_AXIS0_SH(F32,float)

View File

@ -0,0 +1,216 @@
__kernel void cumsum_array_F32toF32_axis1(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int channel,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_out = coord;
float sum = (float)(0);
Tensor img1 = create_tensor_from_image2d_array(input, 4);
Tensor img2 = create_tensor_from_image2d_array(output, 4);
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global float* in_ptr = (__global float*)input_ptr;
__global float* out_ptr = (__global float*)output_ptr;
if(exclusive && rev)
{
coord_out.y = height - 1;
output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
for(coord.y = height - 1; coord.y > 0; coord.y--)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
coord_out.y--;
sum += data;
output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
else if(exclusive)
{
coord_out.y = 0;
output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
for(coord.y = 0; coord.y < height - 1; coord.y++)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
coord_out.y++;
sum += data;
output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
else if(rev)
{
for(coord.y = height - 1; coord.y >= 0; coord.y--)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
sum += data;
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
else
{
for(coord.y = 0; coord.y < height; coord.y++)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
sum += data;
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
}
#define CUMSUM_ARRAY_toU8_AXIS1_SH(name, src_type) \
__kernel void cumsum_array_##name##toU8_axis1( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, \
int exclusive, \
int rev, \
int width, \
int height, \
int channel, \
int input_zp, \
float in_out_scale, \
float in_out_zp_scale, \
float output_zp \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
int4 coord_out = coord; \
\
src_type sum = (src_type)(0); \
uint dst = (uint)(0); \
int tmp_zp = convert_int_rte(output_zp); \
dst = convert_uint_sat(tmp_zp); \
\
float cnt = 0; \
\
Tensor img1 = create_tensor_from_image2d_array(input, 4); \
Tensor img2 = create_tensor_from_image2d_array(output, 4); \
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
__global src_type* in_ptr = (__global src_type*)input_ptr; \
__global uint* out_ptr = (__global uint*)output_ptr; \
if(exclusive && rev) \
{ \
coord_out.y = height - 1; \
output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
\
for(coord.y = height - 1; coord.y > 0; coord.y--) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src_type data = in_ptr[0]; \
cnt += 1.0f; \
coord_out.y--; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum * in_out_scale + tmpAlpha; \
\
dst = (uint)convert_int_rte(tmpSum); \
output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
} \
} \
else if(exclusive) \
{ \
coord_out.y = 0; \
output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
for(coord.y = 0; coord.y < height - 1; coord.y++) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src_type data = in_ptr[0]; \
cnt += 1.0f; \
coord_out.y++; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum * in_out_scale + tmpAlpha; \
\
dst = (uint)convert_int_rte(tmpSum); \
output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
} \
} \
else if(rev) \
{ \
for(coord.y = height - 1; coord.y >= 0; coord.y--) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src_type data = in_ptr[0]; \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum * in_out_scale + tmpAlpha; \
\
dst = (uint)convert_int_rte(tmpSum); \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
} \
} \
else \
{ \
for(coord.y = 0; coord.y < height; coord.y++) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src_type data = in_ptr[0]; \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum * in_out_scale + tmpAlpha; \
\
dst = (uint)convert_int_rte(tmpSum); \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
} \
} \
}
CUMSUM_ARRAY_toU8_AXIS1_SH(U8,uint)
CUMSUM_ARRAY_toU8_AXIS1_SH(F32,float)

View File

@ -0,0 +1,215 @@
__kernel void cumsum_array_F32toF32_axis2(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int channel,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_out = coord;
float sum = 0;
Tensor img1 = create_tensor_from_image2d_array(input, 4);
Tensor img2 = create_tensor_from_image2d_array(output, 4);
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global float* in_ptr = (__global float*)input_ptr;
__global float* out_ptr = (__global float*)output_ptr;
if(exclusive && rev)
{
coord_out.z = channel - 1;
output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
for(coord.z = channel - 1; coord.z > 0; coord.z--)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
coord_out.z--;
sum += data;
output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
else if(exclusive)
{
coord_out.z = 0;
output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
for(coord.z = 0; coord.z < channel - 1; coord.z++)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
coord_out.z++;
sum += data;
output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
else if(rev)
{
for(coord.z = channel - 1; coord.z >= 0; coord.z--)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
sum += data;
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
else
{
for(coord.z = 0; coord.z < channel; coord.z++)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
sum += data;
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
}
#define CUMSUM_ARRAY_toU8_AXIS2_SH(name, src_type) \
__kernel void cumsum_array_##name##toU8_axis2( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, \
int exclusive, \
int rev, \
int width, \
int height, \
int channel, \
int input_zp, \
float in_out_scale, \
float in_out_zp_scale, \
float output_zp \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
int4 coord_out = coord; \
\
src_type sum = (src_type)(0); \
uint dst = (uint)(0); \
int tmp_zp = convert_int_rte(output_zp); \
dst = convert_uint_sat(tmp_zp); \
\
float cnt = 0.0f; \
Tensor img1 = create_tensor_from_image2d_array(input, 4); \
Tensor img2 = create_tensor_from_image2d_array(output, 4); \
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
__global src_type* in_ptr = (__global src_type*)input_ptr; \
__global uint* out_ptr = (__global uint*)output_ptr; \
\
if(exclusive && rev) \
{ \
coord_out.z = channel - 1; \
output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
for(coord.z = channel - 1; coord.z > 0; coord.z--) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src_type data = in_ptr[0]; \
coord_out.z--; \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum * in_out_scale + tmpAlpha; \
\
dst = (uint)convert_int_rte(tmpSum); \
output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
} \
} \
else if(exclusive) \
{ \
coord_out.z = 0; \
output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
for(coord.z = 0; coord.z < channel - 1; coord.z++) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src_type data = in_ptr[0]; \
coord_out.z++; \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum * in_out_scale + tmpAlpha; \
\
dst = (uint)convert_int_rte(tmpSum); \
output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
} \
} \
else if(rev) \
{ \
for(coord.z = channel - 1; coord.z >= 0; coord.z--) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src_type data = in_ptr[0]; \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum * in_out_scale + tmpAlpha; \
\
dst = (uint)convert_int_rte(tmpSum); \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
} \
} \
else \
{ \
for(coord.z = 0; coord.z < channel; coord.z++) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src_type data = in_ptr[0]; \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum * in_out_scale + tmpAlpha; \
\
dst = (uint)convert_int_rte(tmpSum); \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
} \
} \
}
CUMSUM_ARRAY_toU8_AXIS2_SH(U8,uint)
CUMSUM_ARRAY_toU8_AXIS2_SH(F32,float)

View File

@ -18,8 +18,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
\
__local float local_data[128]; \
__local uint local_indices[128]; \
__local float local_data[LOCAL_SIZE0 * 2]; \
__local uint local_indices[LOCAL_SIZE0 * 2]; \
\
float left = read_imagef(input, coord.xy).x; \
coord.z += work_group_size; \
@ -51,7 +51,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
float left_elem = local_data[left_id]; \
float right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem) ^ signo) \
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \
@ -78,13 +78,14 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
write_imagei(indices, coord.xy, index.xxxx); \
write_imagei(indices, coord.zy, index.yyyy); \
}
TOPK_F32(1 << 0, 0)
TOPK_F32(1 << 1, 1)
TOPK_F32(1 << 2, 2)
TOPK_F32(1 << 3, 3)
TOPK_F32(1 << 4, 4)
TOPK_F32(1 << 5, 5)
TOPK_F32(1 << 6, 6)
TOPK_F32((1 << 0), 0)
TOPK_F32((1 << 1), 1)
TOPK_F32((1 << 2), 2)
TOPK_F32((1 << 3), 3)
TOPK_F32((1 << 4), 4)
TOPK_F32((1 << 5), 5)
TOPK_F32((1 << 6), 6)
TOPK_F32((1 << 9), 9)
#define TOPK_U32(LOCAL_SIZE0, STAGES) \
__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_U32toU32_I32 \
@ -106,8 +107,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
\
__local uint local_data[128]; \
__local uint local_indices[128]; \
__local uint local_data[LOCAL_SIZE0 * 2]; \
__local uint local_indices[LOCAL_SIZE0 * 2]; \
\
uint left = read_imageui(input, coord.xy).x; \
coord.z += work_group_size; \
@ -139,7 +140,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
uint left_elem = local_data[left_id]; \
uint right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem) ^ signo) \
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \
@ -166,13 +167,14 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
write_imagei(indices, coord.xy, index.xxxx); \
write_imagei(indices, coord.zy, index.yyyy); \
}
TOPK_U32(1 << 0, 0)
TOPK_U32(1 << 1, 1)
TOPK_U32(1 << 2, 2)
TOPK_U32(1 << 3, 3)
TOPK_U32(1 << 4, 4)
TOPK_U32(1 << 5, 5)
TOPK_U32(1 << 6, 6)
TOPK_U32((1 << 0), 0)
TOPK_U32((1 << 1), 1)
TOPK_U32((1 << 2), 2)
TOPK_U32((1 << 3), 3)
TOPK_U32((1 << 4), 4)
TOPK_U32((1 << 5), 5)
TOPK_U32((1 << 6), 6)
TOPK_U32((1 << 9), 9)
#define TOPK_I32(LOCAL_SIZE0, STAGES) \
__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_I32toI32_I32 \
@ -194,8 +196,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
\
__local int local_data[128]; \
__local int local_indices[128]; \
__local int local_data[LOCAL_SIZE0 * 2]; \
__local int local_indices[LOCAL_SIZE0 * 2]; \
\
int left = read_imagei(input, coord.xy).x; \
coord.z += work_group_size; \
@ -227,7 +229,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
int left_elem = local_data[left_id]; \
int right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem) ^ signo) \
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \
@ -254,13 +256,14 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
write_imagei(indices, coord.xy, index.xxxx); \
write_imagei(indices, coord.zy, index.yyyy); \
}
TOPK_I32(1 << 0, 0)
TOPK_I32(1 << 1, 1)
TOPK_I32(1 << 2, 2)
TOPK_I32(1 << 3, 3)
TOPK_I32(1 << 4, 4)
TOPK_I32(1 << 5, 5)
TOPK_I32(1 << 6, 6)
TOPK_I32((1 << 0), 0)
TOPK_I32((1 << 1), 1)
TOPK_I32((1 << 2), 2)
TOPK_I32((1 << 3), 3)
TOPK_I32((1 << 4), 4)
TOPK_I32((1 << 5), 5)
TOPK_I32((1 << 6), 6)
TOPK_I32((1 << 9), 9)
#define TOPK_F32toU32(LOCAL_SIZE0, STAGES) \
__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_F32toU32_I32 \
@ -282,8 +285,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
\
__local float local_data[128]; \
__local uint local_indices[128]; \
__local float local_data[LOCAL_SIZE0 * 2]; \
__local uint local_indices[LOCAL_SIZE0 * 2]; \
\
float left = read_imagef(input, coord.xy).x; \
coord.z += work_group_size; \
@ -315,7 +318,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
float left_elem = local_data[left_id]; \
float right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem) ^ signo) \
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \
@ -342,13 +345,14 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
write_imagei(indices, coord.zy, index.yyyy); \
}
TOPK_F32toU32(1 << 0, 0)
TOPK_F32toU32(1 << 1, 1)
TOPK_F32toU32(1 << 2, 2)
TOPK_F32toU32(1 << 3, 3)
TOPK_F32toU32(1 << 4, 4)
TOPK_F32toU32(1 << 5, 5)
TOPK_F32toU32(1 << 6, 6)
TOPK_F32toU32((1 << 0), 0)
TOPK_F32toU32((1 << 1), 1)
TOPK_F32toU32((1 << 2), 2)
TOPK_F32toU32((1 << 3), 3)
TOPK_F32toU32((1 << 4), 4)
TOPK_F32toU32((1 << 5), 5)
TOPK_F32toU32((1 << 6), 6)
TOPK_F32toU32((1 << 9), 9)
#define TOPK_F32toI32(LOCAL_SIZE0, STAGES) \
__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_F32toI32_I32 \
@ -370,8 +374,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
\
__local float local_data[128]; \
__local uint local_indices[128]; \
__local float local_data[LOCAL_SIZE0 * 2]; \
__local uint local_indices[LOCAL_SIZE0 * 2]; \
\
float left = read_imagef(input, coord.xy).x; \
coord.z += work_group_size; \
@ -403,7 +407,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
float left_elem = local_data[left_id]; \
float right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem) ^ signo) \
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \
@ -430,10 +434,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
write_imagei(indices, coord.zy, index.yyyy); \
}
TOPK_F32toI32(1 << 0, 0)
TOPK_F32toI32(1 << 1, 1)
TOPK_F32toI32(1 << 2, 2)
TOPK_F32toI32(1 << 3, 3)
TOPK_F32toI32(1 << 4, 4)
TOPK_F32toI32(1 << 5, 5)
TOPK_F32toI32(1 << 6, 6)
TOPK_F32toI32((1 << 0), 0)
TOPK_F32toI32((1 << 1), 1)
TOPK_F32toI32((1 << 2), 2)
TOPK_F32toI32((1 << 3), 3)
TOPK_F32toI32((1 << 4), 4)
TOPK_F32toI32((1 << 5), 5)
TOPK_F32toI32((1 << 6), 6)
TOPK_F32toI32((1 << 9), 9)

View File

@ -0,0 +1,368 @@
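/*
 * Bitonic sort helpers for the single-work-group top-k kernels below.
 *
 * bitonic_step_*        : as used here, a full bitonic sort of the
 *                         2 * BLOCK_SIZE resident elements into descending
 *                         order; keys live in local_data, the matching source
 *                         positions in local_indices, and the index is used
 *                         as a secondary key so equal values keep a
 *                         deterministic order.
 * bitonic_step_ascend_* : the same network with the comparison flipped, used
 *                         to pre-sort every streamed-in chunk ascending.
 * bitonic_merge_*       : only the final merge stage, run after a chunk's
 *                         larger half has been copied into the resident
 *                         region.
 */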
#define BITONIC_STEP(dtype) \
void bitonic_step_##dtype(uint num_stages, int lx, \
__local dtype *local_data, __local int *local_indices) \
{ \
for (uint stage = 0; stage < num_stages + 1; ++stage) \
{ \
uint signo = (lx >> stage) & 1; \
\
for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \
{ \
uint postShift = (stage - passOfStage); \
uint pairDistance = 1 << postShift; \
\
uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \
uint right_id = left_id + pairDistance; \
\
int left_idx = local_indices[left_id]; \
int right_idx = local_indices[right_id]; \
\
dtype left_elem = local_data[left_id]; \
dtype right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \
\
local_indices[left_id] = right_idx; \
local_indices[right_id] = left_idx; \
} \
\
barrier(CLK_LOCAL_MEM_FENCE); \
} \
} \
}
BITONIC_STEP(int)
BITONIC_STEP(uint)
#define BITONIC_STEP_ASCEND(dtype) \
void bitonic_step_ascend_##dtype(uint num_stages, int lx, \
__local dtype *p_share_k, __local int *p_share_v) \
{ \
for (uint stage = 0; stage < num_stages + 1; ++stage) \
{ \
uint signo = (lx >> stage) & 1; \
\
for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \
{ \
uint postShift = (stage - passOfStage); \
uint pairDistance = 1 << postShift; \
\
uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \
uint right_id = left_id + pairDistance; \
\
int left_idx = p_share_v[left_id]; \
int right_idx = p_share_v[right_id]; \
\
dtype left_elem = p_share_k[left_id]; \
dtype right_elem = p_share_k[right_id]; \
\
if ((left_elem > right_elem || (left_elem == right_elem && left_idx > right_idx)) ^ signo) \
{ \
p_share_k[left_id] = right_elem; \
p_share_k[right_id] = left_elem; \
\
p_share_v[left_id] = right_idx; \
p_share_v[right_id] = left_idx; \
} \
\
barrier(CLK_LOCAL_MEM_FENCE); \
} \
} \
}
BITONIC_STEP_ASCEND(int)
BITONIC_STEP_ASCEND(uint)
#define BITONIC_MERGE(dtype) \
void bitonic_merge_##dtype(uint num_stages, int lx, \
__local dtype *local_data, __local int *local_indices) \
{ \
uint stage = num_stages; \
uint signo = (lx >> stage) & 1; \
\
for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \
{ \
uint postShift = (stage - passOfStage); \
uint pairDistance = 1 << postShift; \
\
uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \
uint right_id = left_id + pairDistance; \
\
int left_idx = local_indices[left_id]; \
int right_idx = local_indices[right_id]; \
\
dtype left_elem = local_data[left_id]; \
dtype right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \
\
local_indices[left_id] = right_idx; \
local_indices[right_id] = left_idx; \
} \
\
barrier(CLK_LOCAL_MEM_FENCE); \
} \
}
BITONIC_MERGE(int)
BITONIC_MERGE(uint)
#define BLOCK_SIZE (512)
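/*
 * Per-row top-BLOCK_SIZE selection for rows longer than 2 * BLOCK_SIZE.
 * One work-group of BLOCK_SIZE threads handles one row:
 *   1. load the first 2 * BLOCK_SIZE elements and bitonic-sort them
 *      descending; min_data = local_data[BLOCK_SIZE - 1] is the current
 *      cut-off, i.e. the BLOCK_SIZE-th largest value seen so far;
 *   2. stream the rest of the row in 2 * BLOCK_SIZE chunks, sort each chunk
 *      ascending in the scratch half of local memory, and skip it when its
 *      maximum is below the cut-off; otherwise merge its larger half against
 *      the resident values and refresh the cut-off;
 *   3. the tail chunk (width not a multiple of 2 * BLOCK_SIZE) is padded with
 *      init_k / init_v and handled the same way;
 *   4. finally the best BLOCK_SIZE values and their indices are written out.
 * The _num_stages argument appears to be kept only to match the common topk
 * kernel signature; the stage count here is fixed to 9 (log2(BLOCK_SIZE)).
 */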
__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_I32toI32_I32
(
__read_only image2d_t input,
__write_only image2d_t output,
__write_only image2d_t indices,
float input_scale,
float input_tail,
float output_scale,
float output_tail,
int _num_stages,
int width
)
{
uint lx = get_local_id(0);
const int init_k = -2147483647;
const int init_v = -2147483647;
const int num_stages = 9;
const int threads_per_block = BLOCK_SIZE;
const int index_minus_1 = threads_per_block * 2 - 1;
uint offset = 0;
uint lx1 = lx + threads_per_block;
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
__local int local_data[1536];
__local int local_indices[1536];
int left = read_imagei(input, coord.xy).x;
coord.z += threads_per_block;
int right = read_imagei(input, coord.zy).x;
local_data[lx] = left;
local_indices[lx] = coord.x;
local_data[lx1] = right;
local_indices[lx1] = coord.z;
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_step_int(num_stages, lx, local_data, local_indices);
int min_data = local_data[511];
int *p_share_k = local_data + threads_per_block;
int *p_share_v = local_indices + threads_per_block;
int limit = (width >> 10) << 10;
p_share_k[lx] = init_k;
p_share_v[lx] = init_v;
p_share_k[lx1] = init_k;
p_share_v[lx1] = init_v;
barrier(CLK_LOCAL_MEM_FENCE);
for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)
{
int2 data;
coord.z = coord.x + threads_per_block;
data.x = read_imagei(input, coord.xy).x;
data.y = read_imagei(input, coord.zy).x;
p_share_k[lx] = data.x;
p_share_v[lx] = coord.x;
p_share_k[lx1] = data.y;
p_share_v[lx1] = coord.z;
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);
if (p_share_k[index_minus_1] < min_data)
{
continue;
}
p_share_k[lx] = p_share_k[lx1];
p_share_v[lx] = p_share_v[lx1];
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_merge_int(num_stages, lx, local_data, local_indices);
min_data = local_data[511];
p_share_k[lx] = init_k;
p_share_v[lx] = init_v;
p_share_k[lx1] = init_k;
p_share_v[lx1] = init_v;
}
if (width > limit)
{
if (coord.x < width)
{
int2 data;
data.x = read_imagei(input, coord.xy).x;
coord.z = coord.x + threads_per_block;
data.y = read_imagei(input, coord.zy).x;
p_share_k[lx] = data.x;
p_share_v[lx] = coord.x;
p_share_k[lx1] = coord.z < width ? data.y : init_k;
p_share_v[lx1] = coord.z < width ? coord.z : init_v;
}
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);
if (p_share_k[index_minus_1] >= min_data)
{
p_share_k[lx] = p_share_k[lx1];
p_share_v[lx] = p_share_v[lx1];
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_merge_int(num_stages, lx, local_data, local_indices);
}
}
int4 dst;
dst.x = local_data[lx];
coord.x = lx;
write_imagei(output, coord.xy, dst.xxxx);
int4 index;
index.x = local_indices[lx];
write_imagei(indices, coord.xy, index.xxxx);
}
__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_U32toU32_I32
(
__read_only image2d_t input,
__write_only image2d_t output,
__write_only image2d_t indices,
float input_scale,
float input_tail,
float output_scale,
float output_tail,
int _num_stages,
int width
)
{
uint lx = get_local_id(0);
const uint init_k = 0;
const int init_v = -2147483647;
const int num_stages = 9;
const int threads_per_block = BLOCK_SIZE;
const int index_minus_1 = threads_per_block * 2 - 1;
uint offset = 0;
uint lx1 = lx + threads_per_block;
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
__local uint local_data[1536];
__local int local_indices[1536];
uint left = read_imageui(input, coord.xy).x;
coord.z += threads_per_block;
uint right = read_imageui(input, coord.zy).x;
local_data[lx] = left;
local_indices[lx] = coord.x;
local_data[lx1] = right;
local_indices[lx1] = coord.z;
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_step_uint(num_stages, lx, local_data, local_indices);
uint min_data = local_data[511];
uint *p_share_k = local_data + threads_per_block;
int *p_share_v = local_indices + threads_per_block;
int limit = (width >> 10) << 10;
p_share_k[lx] = init_k;
p_share_v[lx] = init_v;
p_share_k[lx1] = init_k;
p_share_v[lx1] = init_v;
barrier(CLK_LOCAL_MEM_FENCE);
for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)
{
uint2 data;
coord.z = coord.x + threads_per_block;
data.x = read_imageui(input, coord.xy).x;
data.y = read_imageui(input, coord.zy).x;
p_share_k[lx] = data.x;
p_share_v[lx] = coord.x;
p_share_k[lx1] = data.y;
p_share_v[lx1] = coord.z;
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);
if (p_share_k[index_minus_1] < min_data)
{
continue;
}
p_share_k[lx] = p_share_k[lx1];
p_share_v[lx] = p_share_v[lx1];
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_merge_uint(num_stages, lx, local_data, local_indices);
min_data = local_data[511];
p_share_k[lx] = init_k;
p_share_v[lx] = init_v;
p_share_k[lx1] = init_k;
p_share_v[lx1] = init_v;
}
if (width > limit)
{
if (coord.x < width)
{
uint2 data;
data.x = read_imageui(input, coord.xy).x;
coord.z = coord.x + threads_per_block;
data.y = read_imageui(input, coord.zy).x;
p_share_k[lx] = data.x;
p_share_v[lx] = coord.x;
p_share_k[lx1] = coord.z < width ? data.y : init_k;
p_share_v[lx1] = coord.z < width ? coord.z : init_v;
}
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);
if (p_share_k[index_minus_1] >= min_data)
{
p_share_k[lx] = p_share_k[lx1];
p_share_v[lx] = p_share_v[lx1];
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_merge_uint(num_stages, lx, local_data, local_indices);
}
}
uint4 dst;
dst.x = local_data[lx];
coord.x = lx;
write_imageui(output, coord.xy, dst.xxxx);
int4 index;
index.x = local_indices[lx];
write_imagei(indices, coord.xy, index.xxxx);
}

View File

@ -0,0 +1,344 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;
_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;
_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4;
_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4;
_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4;
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;
_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;
_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;
_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;
_viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4;
_viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4;
_viv_uniform VXC_512Bits uniSubZpI16toI16_2x8;
_viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4;
_viv_uniform VXC_512Bits uniAccSumHorzI16toI32B_4x4;
_viv_uniform VXC_512Bits uniSetZeroF16_2x8;
_viv_uniform int width;
_viv_uniform int height;
_viv_uniform int channel;
_viv_uniform int input_zp;
_viv_uniform float in_out_scale;
_viv_uniform float in_out_zp_scale;
_viv_uniform float output_zp;
_viv_uniform int remainder;
_viv_uniform int w_size;
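/*
 * Array (pointer-based) cumulative-sum kernels along axis 2, 1 and 0.
 *
 * For the quantized kernels the accumulation runs on raw integer values and
 * the rescaling is folded into host-provided constants.  A rough per-element
 * reference of what the axis-2 loops below compute (host-side sketch;
 * in_out_scale and in_out_zp_scale fold the input/output scales and the
 * input zero point, output_zp is the output zero point):
 *
 *     int   raw_sum = 0;
 *     float cnt     = 0.0f;
 *     for (int z = 0; z < channel; ++z) {
 *         raw_sum += in[z];                     // raw quantized input
 *         cnt     += 1.0f;
 *         float acc = raw_sum * in_out_scale    // rescale the running sum
 *                   + cnt * in_out_zp_scale     // compensate the accumulated input zero point
 *                   + output_zp;
 *         out[z] = saturate(round_to_nearest(acc));
 *     }
 *
 * The w_size / remainder check pulls the last vector access back inside the
 * row when the width is not a multiple of the vector size (8 or 16), so no
 * load or store runs past the end of a row.
 */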
__kernel void cumsum_array_F16toF16_axis2(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis, int exclusive, int rev
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_short8 src, dst;
vxc_half8 data, sum;
VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);
Tensor img1 = create_tensor_from_image2d_array(input, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
for(coord.z = 0; coord.z < channel; coord.z++)
{
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;
__global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;
src = in_ptr[0];
_viv_asm(COPY, data, src, 16);
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);
_viv_asm(COPY, dst, sum, 16);
out_ptr[0] = dst;
}
}
#define CUMSUM_8BITS_ARRAY_AXIS2(in_name, out_name, src_type, dst_type) \
__kernel void cumsum_array_##in_name##to##out_name##_axis2( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, int exclusive, int rev \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
\
src_type src; \
dst_type dst; \
int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \
\
Tensor img1 = create_tensor_from_image2d_array(input, 1); \
Tensor img2 = create_tensor_from_image2d_array(output, 1); \
if (coord.x == ((w_size >> 4) * 16) && remainder != 0) \
{ \
coord.x = coord.x - (16 - remainder); \
} \
for(coord.z = 0; coord.z < channel; coord.z++) \
{ \
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
__global src_type* in_ptr = (__global src_type*)input_ptr; \
__global dst_type* out_ptr = (__global dst_type*)output_ptr; \
src = in_ptr[0]; \
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \
VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \
VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \
float tmpAlpha = convert_float(coord.z + 1) * in_out_zp_scale + output_zp; \
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \
float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \
float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \
int4 tmpDst0 = convert_int4_rte(tmpSum0); \
int4 tmpDst1 = convert_int4_rte(tmpSum1); \
int4 tmpDst2 = convert_int4_rte(tmpSum2); \
int4 tmpDst3 = convert_int4_rte(tmpSum3); \
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\
VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), uniConvertInt32toUint8_2x8);\
out_ptr[0] = dst; \
} \
}
CUMSUM_8BITS_ARRAY_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16)
CUMSUM_8BITS_ARRAY_AXIS2(I8, I8, vxc_char16, vxc_char16)
__kernel void cumsum_array_I16toI16_axis2(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis, int exclusive, int rev
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_short8 src, dst;
int4 sum0 = (int4)(0), sum1 = (int4)(0);
Tensor img1 = create_tensor_from_image2d_array(input, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
for(coord.z = 0; coord.z < channel; coord.z++)
{
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;
__global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;
src = in_ptr[0];
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);
float tmpAlpha = convert_float(coord.z + 1) * in_out_zp_scale + output_zp;
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;
int4 tmpDst0 = convert_int4_rte(tmpSum0);
int4 tmpDst1 = convert_int4_rte(tmpSum1);
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1), uniConvertInt32toUint8_2x8);
out_ptr[0] = dst;
}
}
__kernel void cumsum_array_F16toF16_axis1(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis, int exclusive, int rev
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_short8 src, dst;
vxc_half8 data, sum;
VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);
Tensor img1 = create_tensor_from_image2d_array(input, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
for(coord.y = 0; coord.y < height; coord.y++)
{
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;
__global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;
src = in_ptr[0];
_viv_asm(COPY, data, src, 16);
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);
_viv_asm(COPY, dst, sum, 16);
out_ptr[0] = dst;
}
}
#define CUMSUM_8BITS_ARRAY_AXIS1(in_name, out_name, src_type, dst_type) \
__kernel void cumsum_array_##in_name##to##out_name##_axis1( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, int exclusive, int rev \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
\
src_type src; \
dst_type dst; \
int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \
    Tensor img1 = create_tensor_from_image2d_array(input, 1); \
    Tensor img2 = create_tensor_from_image2d_array(output, 1); \
if (coord.x == ((w_size >> 4) * 16) && remainder != 0) \
{ \
coord.x = coord.x - (16 - remainder); \
} \
\
for(coord.y = 0; coord.y < height; coord.y++) \
{ \
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
__global src_type* in_ptr = (__global src_type*)input_ptr; \
__global dst_type* out_ptr = (__global dst_type*)output_ptr; \
src = in_ptr[0]; \
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \
VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \
VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \
float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp; \
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \
float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \
float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \
int4 tmpDst0 = convert_int4_rte(tmpSum0); \
int4 tmpDst1 = convert_int4_rte(tmpSum1); \
int4 tmpDst2 = convert_int4_rte(tmpSum2); \
int4 tmpDst3 = convert_int4_rte(tmpSum3); \
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\
VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\
out_ptr[0] = dst; \
} \
}
CUMSUM_8BITS_ARRAY_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16)
CUMSUM_8BITS_ARRAY_AXIS1(I8, I8, vxc_char16, vxc_char16)
__kernel void cumsum_array_I16toI16_axis1(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis, int exclusive, int rev
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_short8 src, dst;
int4 sum0 = (int4)(0), sum1 = (int4)(0);
Tensor img1 = create_tensor_from_image2d_array(input, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
for(coord.y = 0; coord.y < height; coord.y++)
{
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;
__global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;
src = in_ptr[0];
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);
float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp;
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;
int4 tmpDst0 = convert_int4_rte(tmpSum0);
int4 tmpDst1 = convert_int4_rte(tmpSum1);
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);
out_ptr[0] = dst;
}
}
__kernel void cumsum_array_F16toF16_axis0(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis, int exclusive, int rev
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_short8 src, dst;
vxc_half8 data, tmpsum, sum;
VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);
Tensor img1 = create_tensor_from_image2d_array(input, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
for(; coord.x < width; coord.x += 8)
{
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;
__global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;
src = in_ptr[0];
_viv_asm(COPY, data, src, 16);
VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4);
VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4);
VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8);
VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8);
_viv_asm(COPY, dst, sum, 16);
out_ptr[0] = dst;
}
}
#define CUMSUM_ARRAY_QINT_AXIS0(in_name, out_name, src_type, dst_type, stride_data) \
__kernel void cumsum_array_##in_name##to##out_name##_axis0( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, int exclusive, int rev \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
\
src_type src; \
dst_type dst; \
vxc_short8 rowSum; \
int4 sum0 = (int4)(0), sum1 = (int4)(0); \
    short zp = (short)input_zp; \
    Tensor img1 = create_tensor_from_image2d_array(input, stride_data); \
    Tensor img2 = create_tensor_from_image2d_array(output, stride_data); \
 \
for(; coord.x < width; coord.x += 8) \
{ \
if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \
{ \
coord.x = coord.x - (8 - remainder); \
} \
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
__global src_type* in_ptr = (__global src_type*)input_ptr; \
__global dst_type* out_ptr = (__global dst_type*)output_ptr; \
src = in_ptr[0]; \
VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16A_4x4); \
VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16B_8x4); \
VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpI16toI16_2x8); \
VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumHorzI16toI32A_4x4); \
VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumHorzI16toI32B_4x4); \
\
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \
int4 tmpDst0 = convert_int4_rte(tmpSum0); \
int4 tmpDst1 = convert_int4_rte(tmpSum1); \
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\
out_ptr[0] = dst; \
} \
}
CUMSUM_ARRAY_QINT_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16, 1)
CUMSUM_ARRAY_QINT_AXIS0(I8, I8, vxc_char16, vxc_char16, 1)
CUMSUM_ARRAY_QINT_AXIS0(I16, I16, vxc_short8, vxc_short8, 2)

View File

@ -0,0 +1,259 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;
_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;
_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4;
_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4;
_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4;
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;
_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;
_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;
_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;
_viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4;
_viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4;
_viv_uniform VXC_512Bits uniSubZpI16toI16_2x8;
_viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4;
_viv_uniform VXC_512Bits uniAccSumHorzI16toI32B_4x4;
_viv_uniform VXC_512Bits uniSetZeroF16_2x8;
_viv_uniform int width;
_viv_uniform int height;
_viv_uniform int input_zp;
_viv_uniform float in_out_scale;
_viv_uniform float in_out_zp_scale;
_viv_uniform float output_zp;
_viv_uniform int remainder;
_viv_uniform int w_size;
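/*
 * 2D (single image) variants of the array-based cumulative-sum kernels.
 * They mirror the image2d_array versions but address the data through
 * Image / create_image_from_image2d accessors, and use the same
 * w_size / remainder handling to keep the last 8- or 16-wide access inside
 * the row.
 */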
__kernel void cumsum_array_F16toF16_axis1_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis, int exclusive, int rev
)
{
int2 coord = (int2)(get_global_id(0), 0);
vxc_short8 src, dst;
vxc_half8 data, sum;
VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);
Image img1 = create_image_from_image2d(input, 2);
Image img2 = create_image_from_image2d(output, 2);
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
for(; coord.y < height; coord.y++)
{
uchar* input_ptr = get_image_ptr_from_coord(img1, coord);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord);
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;
__global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;
        src = in_ptr[0];
_viv_asm(COPY, data, src, 16);
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
uniAccSumVertF16toF16_2x8);
_viv_asm(COPY, dst, sum, 16);
out_ptr[0] = dst;
}
}
#define CUMSUM_8BITS_ARRAY_AXIS1_2D(in_name, out_name, src_type, dst_type) \
__kernel void cumsum_array_##in_name##to##out_name##_axis1_2D( \
__read_only image2d_t input, \
__write_only image2d_t output, \
int axis, int exclusive, int rev \
) \
{ \
int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
\
src_type src; \
dst_type dst; \
int4 sum0 = (int4)(0); \
int4 sum1 = (int4)(0); \
int4 sum2 = (int4)(0); \
int4 sum3 = (int4)(0); \
\
Image img1 = create_image_from_image2d(input, 1); \
Image img2 = create_image_from_image2d(output, 1); \
if (coord.x == ((w_size >> 4) * 16) && remainder != 0) \
{ \
coord.x = coord.x - (16 - remainder); \
} \
for(coord.y = 0; coord.y < height; coord.y++) \
{ \
uchar* input_ptr = get_image_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_image_ptr_from_coord(img2, coord); \
__global src_type* in_ptr = (__global src_type*)input_ptr; \
__global dst_type* out_ptr = (__global dst_type*)output_ptr; \
src = in_ptr[0]; \
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniAccSumVertU8toI32A_4x4); \
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniAccSumVertU8toI32B_4x4); \
VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniAccSumVertU8toI32C_4x4); \
VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniAccSumVertU8toI32D_4x4); \
\
float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp; \
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \
float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \
float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \
int4 tmpDst0 = convert_int4_rte(tmpSum0); \
int4 tmpDst1 = convert_int4_rte(tmpSum1); \
int4 tmpDst2 = convert_int4_rte(tmpSum2); \
int4 tmpDst3 = convert_int4_rte(tmpSum3); \
\
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
uniConvertInt32toUint8_2x8); \
VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), \
uniConvertInt32toUint8_2x8); \
out_ptr[0] = dst; \
} \
}
CUMSUM_8BITS_ARRAY_AXIS1_2D(U8, U8, vxc_uchar16, vxc_uchar16)
CUMSUM_8BITS_ARRAY_AXIS1_2D(I8, I8, vxc_char16, vxc_char16)
__kernel void cumsum_array_I16toI16_axis1_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis, int exclusive, int rev
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
vxc_short8 src, dst;
int4 sum0 = (int4)(0), sum1 = (int4)(0);
Image img1 = create_image_from_image2d(input, 2);
Image img2 = create_image_from_image2d(output, 2);
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
for(coord.y = 0; coord.y < height; coord.y++)
{
uchar* input_ptr = get_image_ptr_from_coord(img1, coord);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord);
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;
__global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;
src = in_ptr[0];
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniAccSumVertU8toI32A_4x4);
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniAccSumVertU8toI32B_4x4);
float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp;
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;
int4 tmpDst0 = convert_int4_rte(tmpSum0);
int4 tmpDst1 = convert_int4_rte(tmpSum1);
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),
uniConvertInt32toUint8_2x8);
out_ptr[0] = dst;
}
}
__kernel void cumsum_array_F16toF16_axis0_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis, int exclusive, int rev
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
vxc_short8 src, dst;
vxc_half8 data, tmpsum, sum;
VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);
Image img1 = create_image_from_image2d(input, 2);
Image img2 = create_image_from_image2d(output, 2);
for(; coord.x < width; coord.x += 8)
{
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
uchar* input_ptr = get_image_ptr_from_coord(img1, coord);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord);
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;
__global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;
src = in_ptr[0];
_viv_asm(COPY, data, src, 16);
VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniSumHorzF16toF16A_4x4);
VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\
uniSumHorzF16toF16B_4x4);
VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
uniSumHorzF16toF16C_2x8);
VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
uniAccSumHorzF16toF16_2x8);
_viv_asm(COPY, dst, sum, 16);
out_ptr[0] = dst;
}
}
#define CUMSUM_ARRAY_QINT_AXIS0_2D(in_name, out_name, src_type, dst_type, stride_data) \
__kernel void cumsum_array_##in_name##to##out_name##_axis0_2D( \
__read_only image2d_t input, \
__write_only image2d_t output, \
int axis, int exclusive, int rev \
) \
{ \
int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
\
src_type src; \
dst_type dst; \
vxc_short8 rowSum; \
    int4 sum0 = (int4)(0), sum1 = (int4)(0); \
short zp = (short)input_zp; \
Image img1 = create_image_from_image2d(input, stride_data); \
Image img2 = create_image_from_image2d(output, stride_data); \
\
for(; coord.x < width; coord.x += 8) \
{ \
if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \
{ \
coord.x = coord.x - (8 - remainder); \
} \
uchar* input_ptr = get_image_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_image_ptr_from_coord(img2, coord); \
__global src_type* in_ptr = (__global src_type*)input_ptr; \
__global dst_type* out_ptr = (__global dst_type*)output_ptr; \
src = in_ptr[0]; \
VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniSumHorzU8toI16A_4x4); \
VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\
uniSumHorzU8toI16B_8x4); \
VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
uniSubZpI16toI16_2x8); \
VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniAccSumHorzI16toI32A_4x4); \
VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniAccSumHorzI16toI32B_4x4); \
\
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \
int4 tmpDst0 = convert_int4_rte(tmpSum0); \
int4 tmpDst1 = convert_int4_rte(tmpSum1); \
\
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
uniConvertInt32toUint8_2x8); \
out_ptr[0] = dst; \
} \
}
CUMSUM_ARRAY_QINT_AXIS0_2D(U8, U8, vxc_uchar16, vxc_uchar16, 1)
CUMSUM_ARRAY_QINT_AXIS0_2D(I8, I8, vxc_char16, vxc_char16, 1)
CUMSUM_ARRAY_QINT_AXIS0_2D(I16, I16, vxc_short8, vxc_short8, 2)

View File

@ -0,0 +1,244 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
_viv_uniform int width;
_viv_uniform int height;
_viv_uniform int channel;
_viv_uniform int remainder;
_viv_uniform int w_size;
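/*
 * BF16 cumulative sum, array variant.  Each vxc_ushort8 of bfloat16 values is
 * widened to two float4 vectors by moving the 16-bit patterns into the upper
 * half of 32-bit words (uniConvBF16toF32_Part0/1), the accumulation is done
 * in float, and uniExtractOddData_2x8 takes the upper 16 bits of every float
 * back as the bf16 result (plain truncation, no rounding).
 */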
__kernel void cumsum_array_BF16toBF16_axis2(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis, int exclusive, int rev
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_ushort8 src, val0, val1;
vxc_ushort8 dst0, dst1, dst;
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
float4 sum0 = (float4)(0), sum1 = (float4)(0);
Tensor img1 = create_tensor_from_image2d_array(input, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
for(coord.z = 0; coord.z < channel; coord.z++)
{
float4 data0, data1;
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global vxc_ushort8* in_ptr = (__global vxc_ushort8*)input_ptr;
__global vxc_ushort8* out_ptr = (__global vxc_ushort8*)output_ptr;
src = in_ptr[0];
VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, data0, val0, 16);
_viv_asm(COPY, data1, val1, 16);
sum0 += data0;
sum1 += data1;
_viv_asm(COPY, dst0, sum0, 16);
_viv_asm(COPY, dst1, sum1, 16);
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
out_ptr[0] = dst;
}
}
__kernel void cumsum_array_BF16toBF16_axis1(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis, int exclusive, int rev
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_ushort8 src, val0, val1;
vxc_ushort8 dst0, dst1, dst;
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
float4 sum0 = (float4)(0), sum1 = (float4)(0);
Tensor img1 = create_tensor_from_image2d_array(input, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
for(coord.y = 0; coord.y < height; coord.y++)
{
float4 data0, data1;
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global vxc_ushort8* in_ptr = (__global vxc_ushort8*)input_ptr;
__global vxc_ushort8* out_ptr = (__global vxc_ushort8*)output_ptr;
src = in_ptr[0];
VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, data0, val0, 16);
_viv_asm(COPY, data1, val1, 16);
sum0 += data0;
sum1 += data1;
_viv_asm(COPY, dst0, sum0, 16);
_viv_asm(COPY, dst1, sum1, 16);
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
out_ptr[0] = dst;
}
}
__kernel void cumsum_array_BF16toBF16_axis0(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis, int exclusive, int rev
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_ushort8 src, val0, val1;
vxc_ushort8 dst0, dst1, dst;
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
float preSum = 0;
float4 one = (float4)(1.0, 1.0, 1.0, 1.0);
float4 q = (float4)(1.0, 1.0, 1.0, 0);
Tensor img1 = create_tensor_from_image2d_array(input, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
for(; coord.x < width; coord.x += 8)
{
float4 data0, data1;
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global vxc_ushort8* in_ptr = (__global vxc_ushort8*)input_ptr;
__global vxc_ushort8* out_ptr = (__global vxc_ushort8*)output_ptr;
src = in_ptr[0];
VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, data0, val0, 16);
_viv_asm(COPY, data1, val1, 16);
float4 tmpSum0 = (float4)(data0.x, data0.x + data0.y, dot(data0, q), dot(data0, one));
float4 tmpSum1 = (float4)(data1.x, data1.x + data1.y, dot(data1, q), dot(data1, one));
tmpSum1 += tmpSum0.w;
tmpSum0 += preSum;
tmpSum1 += preSum;
preSum = tmpSum1.w;
_viv_asm(COPY, dst0, tmpSum0, 16);
_viv_asm(COPY, dst1, tmpSum1, 16);
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
out_ptr[0] = dst;
}
}
__kernel void cumsum_array_BF16toBF16_axis1_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis, int exclusive, int rev
)
{
int2 coord = (int2)(get_global_id(0), 0);
vxc_ushort8 src, val0, val1;
vxc_ushort8 dst0, dst1, dst;
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
float4 sum0 = (float4)(0), sum1 = (float4)(0);
Image img1 = create_image_from_image2d(input, 2);
Image img2 = create_image_from_image2d(output, 2);
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
for(; coord.y < height; coord.y++)
{
float4 data0, data1;
uchar* input_ptr = get_image_ptr_from_coord(img1, coord);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord);
__global vxc_ushort8* in_ptr = (__global vxc_ushort8*)input_ptr;
__global vxc_ushort8* out_ptr = (__global vxc_ushort8*)output_ptr;
src = in_ptr[0];
VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, data0, val0, 16);
_viv_asm(COPY, data1, val1, 16);
sum0 += data0;
sum1 += data1;
_viv_asm(COPY, dst0, sum0, 16);
_viv_asm(COPY, dst1, sum1, 16);
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniExtractOddData_2x8);
out_ptr[0] = dst;
}
}
__kernel void cumsum_array_BF16toBF16_axis0_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis, int exclusive, int rev
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
vxc_ushort8 src, val0, val1;
vxc_ushort8 dst0, dst1, dst;
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
float preSum = 0;
float4 one = (float4)(1.0, 1.0, 1.0, 1.0);
float4 q = (float4)(1.0, 1.0, 1.0, 0);
Image img1 = create_image_from_image2d(input, 2);
Image img2 = create_image_from_image2d(output, 2);
for(; coord.x < width; coord.x += 8)
{
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
float4 data0, data1;
uchar* input_ptr = get_image_ptr_from_coord(img1, coord);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord);
__global vxc_ushort8* in_ptr = (__global vxc_ushort8*)input_ptr;
__global vxc_ushort8* out_ptr = (__global vxc_ushort8*)output_ptr;
src = in_ptr[0];
VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, data0, val0, 16);
_viv_asm(COPY, data1, val1, 16);
float4 tmpSum0 = (float4)(data0.x, data0.x + data0.y, dot(data0, q), dot(data0, one));
float4 tmpSum1 = (float4)(data1.x, data1.x + data1.y, dot(data1, q), dot(data1, one));
tmpSum1 += tmpSum0.w;
tmpSum0 += preSum;
tmpSum1 += preSum;
preSum = tmpSum1.w;
_viv_asm(COPY, dst0, tmpSum0, 16);
_viv_asm(COPY, dst1, tmpSum1, 16);
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniExtractOddData_2x8);
out_ptr[0] = dst;
}
}

View File

@ -0,0 +1,259 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;
_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;
_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;
_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;
_viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4;
_viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4;
_viv_uniform VXC_512Bits uniSubZpI16toI16_2x8;
_viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4;
_viv_uniform VXC_512Bits uniAccSumHorzI16toI32B_4x4;
_viv_uniform VXC_512Bits uniSetZeroF16_2x8;
_viv_uniform VXC_512Bits uniSumHorzRevF16toF16A_4x4;
_viv_uniform VXC_512Bits uniSumHorzRevF16toF16B_4x4;
_viv_uniform VXC_512Bits uniSumHorzRevF16toF16C_2x8;
_viv_uniform VXC_512Bits uniAccSumHorzRevF16toF16_2x8;
_viv_uniform VXC_512Bits uniSumHorzRevU8toI16A_4x4;
_viv_uniform VXC_512Bits uniSumHorzRevU8toI16B_8x4;
_viv_uniform VXC_512Bits uniSubZpRevI16toI16_2x8;
_viv_uniform VXC_512Bits uniAccSumHorzRevI16toI32A_4x4;
_viv_uniform VXC_512Bits uniAccSumHorzRevI16toI32B_4x4;
_viv_uniform int width;
_viv_uniform int input_zp;
_viv_uniform float in_out_scale;
_viv_uniform float output_zp;
_viv_uniform int remainder;
_viv_uniform int w_size;
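/*
 * Exclusive / reverse cumulative sum along axis 0 (width), array variant.
 *   rev       : accumulate from the end of the row towards the start;
 *   exclusive : the running sum excludes the current element, so the first
 *               output (the last one when rev is set) is just the zero /
 *               zero-point value and every result is shifted by one element.
 *
 * Host-side sketch of the exclusive + reverse case on raw values:
 *
 *     out[width - 1] = 0;
 *     for (int x = width - 1; x > 0; --x)
 *         out[x - 1] = out[x] + in[x];      // walk from the end toward x = 0
 *
 * followed by the usual requantization (raw_sum * in_out_scale + output_zp;
 * the input zero point is removed per element through the uniSubZp* tables).
 */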
__kernel void cumsum_ex_rev_array_F16toF16_axis0(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis, int exclusive, int rev
)
{
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);
int4 coord_out = coord;
vxc_short8 src, dst;
vxc_half8 data, tmpsum, sum;
VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);
Tensor img1 = create_tensor_from_image2d_array(input, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;
__global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;
if(exclusive == 0 && rev)
{
for(coord.x = width - 8; coord.x >= 0; coord.x -= 8)
{
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
input_ptr = get_tensor_ptr_from_coord(img1, coord);
output_ptr = get_tensor_ptr_from_coord(img2, coord);
in_ptr = (__global vxc_short8*)input_ptr;
out_ptr = (__global vxc_short8*)output_ptr;
src = in_ptr[0];
_viv_asm(COPY, data, src, 16);
VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16A_4x4);
VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16B_4x4);
VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniSumHorzRevF16toF16C_2x8);
VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzRevF16toF16_2x8);
_viv_asm(COPY, dst, sum, 16);
out_ptr[0] = dst;
}
}
else if(exclusive && rev == 0)
{
_viv_asm(COPY, dst, sum, 16);
out_ptr[0] = dst;
for(; coord.x < width - 8;)
{
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global vxc_short8*)input_ptr;
src = in_ptr[0];
coord_out.x = coord.x + 1;
coord.x += 8;
_viv_asm(COPY, data, src, 16);
output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
out_ptr = (__global vxc_short8*)output_ptr;
VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4);
VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4);
VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8);
VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8);
_viv_asm(COPY, dst, sum, 16);
out_ptr[0] = dst;
}
}
else if(exclusive && rev)
{
coord.x = width - 8;
coord_out.x = width - 1;
_viv_asm(COPY, dst, sum, 16);
output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
out_ptr = (__global vxc_short8*)output_ptr;
out_ptr[0] = dst;
for(; coord.x > 0;)
{
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
input_ptr = get_tensor_ptr_from_coord(img1, coord);
output_ptr = get_tensor_ptr_from_coord(img2, coord);
in_ptr = (__global vxc_short8*)input_ptr;
out_ptr = (__global vxc_short8*)output_ptr;
src = in_ptr[0];
coord_out.x = coord.x - 1;
coord.x -= 8;
_viv_asm(COPY, data, src, 16);
VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16A_4x4);
VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16B_4x4);
VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniSumHorzRevF16toF16C_2x8);
VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzRevF16toF16_2x8);
_viv_asm(COPY, dst, sum, 16);
out_ptr[0] = dst;
}
}
}
#define CUMSUM_QINT_EX_REV_ARRAY_AXIS0(in_name, out_name, src_type, dst_type, stride_data) \
__kernel void cumsum_ex_rev_array_##in_name##to##out_name##_axis0( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, int exclusive, int rev \
) \
{ \
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); \
int4 coord_out = coord; \
\
src_type src; \
dst_type dst; \
vxc_short8 rowSum; \
int4 sum0 = (int4)(0), sum1 = (int4)(0); \
short zp = (short)input_zp; \
\
Tensor img1 = create_tensor_from_image2d_array(input, stride_data); \
Tensor img2 = create_tensor_from_image2d_array(output, stride_data); \
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
__global src_type* in_ptr = (__global src_type*)input_ptr; \
__global dst_type* out_ptr = (__global dst_type*)output_ptr; \
if(exclusive == 0 && rev) \
{ \
for(coord.x = width - 8; coord.x >= 0; coord.x -= 8) \
{ \
if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \
{ \
coord.x = coord.x - (8 - remainder); \
} \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
in_ptr = (__global src_type*)input_ptr; \
out_ptr = (__global dst_type*)output_ptr; \
            src = in_ptr[0]; \
VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16A_4x4); \
VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16B_8x4); \
VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpRevI16toI16_2x8); \
VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
uniAccSumHorzRevI16toI32A_4x4); \
VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
uniAccSumHorzRevI16toI32B_4x4); \
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \
int4 tmpDst0 = convert_int4_rte(tmpSum0); \
int4 tmpDst1 = convert_int4_rte(tmpSum1); \
VXC_DP2x8(dst, tmpDst1, tmpDst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
uniConvertInt32toUint8_2x8); \
out_ptr[0] = dst; \
} \
} \
else if(exclusive && rev == 0) \
{ \
for(coord.x = -1; coord.x < width - 8;) \
{ \
if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \
{ \
coord.x = coord.x - (8 - remainder); \
} \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src = in_ptr[0]; \
coord_out.x = coord.x + 1; \
coord.x += 8; \
output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \
out_ptr = (__global dst_type*)output_ptr; \
VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16A_4x4); \
VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16B_8x4); \
VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpI16toI16_2x8); \
VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
uniAccSumHorzI16toI32A_4x4); \
VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
uniAccSumHorzI16toI32B_4x4); \
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \
int4 tmpDst0 = convert_int4_rte(tmpSum0); \
int4 tmpDst1 = convert_int4_rte(tmpSum1); \
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
uniConvertInt32toUint8_2x8); \
out_ptr[0] = dst; \
} \
} \
else if(exclusive && rev) \
{ \
for(coord.x = width - 7; coord.x > 0;) \
{ \
if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \
{ \
coord.x = coord.x - (8 - remainder); \
} \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
in_ptr = (__global src_type*)input_ptr; \
out_ptr = (__global dst_type*)output_ptr; \
src = in_ptr[0]; \
coord_out.x = coord.x - 1; \
coord.x -= 8; \
VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16A_4x4); \
VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16B_8x4); \
VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpRevI16toI16_2x8); \
VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
uniAccSumHorzRevI16toI32A_4x4); \
VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
uniAccSumHorzRevI16toI32B_4x4); \
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \
int4 tmpDst0 = convert_int4_rte(tmpSum0); \
int4 tmpDst1 = convert_int4_rte(tmpSum1); \
VXC_DP2x8(dst, tmpDst1, tmpDst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
uniConvertInt32toUint8_2x8); \
out_ptr[0] = dst; \
} \
} \
}
CUMSUM_QINT_EX_REV_ARRAY_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16, 1)
CUMSUM_QINT_EX_REV_ARRAY_AXIS0(I8, I8, vxc_char16, vxc_char16, 1)
CUMSUM_QINT_EX_REV_ARRAY_AXIS0(I16, I16, vxc_short8, vxc_short8, 2)

View File

@ -0,0 +1,330 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;
_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;
_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4;
_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4;
_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4;
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
_viv_uniform VXC_512Bits uniSetZeroF16_2x8;
_viv_uniform int height;
_viv_uniform float in_out_scale;
_viv_uniform float in_out_zp_scale;
_viv_uniform float output_zp;
_viv_uniform int remainder;
_viv_uniform int w_size;
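// Cumulative sum of F16 data along axis 1 (height) using the pointer-based "array" path.
// Only the reverse, exclusive, and exclusive+reverse modes are handled here; w_size and
// remainder realign the last 8-wide vector of each row so it stays inside the tensor.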
__kernel void cumsum_ex_rev_array_F16toF16_axis1(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis, int exclusive, int rev)
{
int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0);
vxc_short8 src, dst;
vxc_half8 data, sum;
VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
Tensor img1 = create_tensor_from_image2d_array(input, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;
__global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;
if(exclusive == 0 && rev)
{
for(coord.y = height - 1; coord.y >= 0; coord.y--)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
output_ptr = get_tensor_ptr_from_coord(img2, coord);
in_ptr = (__global vxc_short8*)input_ptr;
out_ptr = (__global vxc_short8*)output_ptr;
src = in_ptr[0];
_viv_asm(COPY, data, src, 16);
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);
_viv_asm(COPY, dst, sum, 16);
out_ptr[0] = dst;
}
}
else if(exclusive && rev == 0)
{
dst ^= dst;
out_ptr[0] = dst;
for(; coord.y < height - 1;)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global vxc_short8*)input_ptr;
src = in_ptr[0];
coord.y++;
_viv_asm(COPY, data, src, 16);
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global vxc_short8*)output_ptr;
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);
_viv_asm(COPY, dst, sum, 16);
out_ptr[0] = dst;
}
}
else if(exclusive && rev)
{
dst ^= dst;
coord.y = height - 1;
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global vxc_short8*)output_ptr;
out_ptr[0] = dst;
for(; coord.y > 0;)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global vxc_short8*)input_ptr;
src = in_ptr[0];
coord.y--;
_viv_asm(COPY, data, src, 16);
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global vxc_short8*)output_ptr;
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);
_viv_asm(COPY, dst, sum, 16);
out_ptr[0] = dst;
}
}
}
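// 8-bit (U8/I8) exclusive/reverse cumsum along axis 1: 16 elements per work-item are
// accumulated into four int4 partial sums, rescaled with in_out_scale/in_out_zp_scale,
// and requantized to the output zero point.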
#define CUMSUM_8BITS_EX_REV_ARRAY_AXIS1(in_name, out_name, src_type, dst_type) \
__kernel void cumsum_ex_rev_array_##in_name##to##out_name##_axis1( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, int exclusive, int rev) \
{ \
int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); \
\
src_type src; \
dst_type dst; \
int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \
\
if (coord.x == ((w_size >> 4) * 16) && remainder != 0) \
{ \
coord.x = coord.x - (16 - remainder); \
} \
Tensor img1 = create_tensor_from_image2d_array(input, 1); \
Tensor img2 = create_tensor_from_image2d_array(output, 1); \
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
__global src_type* in_ptr = (__global src_type*)input_ptr; \
__global dst_type* out_ptr = (__global dst_type*)output_ptr; \
if(exclusive == 0 && rev) \
{ \
for(coord.y = height - 1; coord.y >= 0; coord.y--) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
in_ptr = (__global src_type*)input_ptr; \
out_ptr = (__global dst_type*)output_ptr; \
src = in_ptr[0]; \
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \
VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \
VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \
float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; \
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \
float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \
float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \
int4 tmpDst0 = convert_int4_rte(tmpSum0); \
int4 tmpDst1 = convert_int4_rte(tmpSum1); \
int4 tmpDst2 = convert_int4_rte(tmpSum2); \
int4 tmpDst3 = convert_int4_rte(tmpSum3); \
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
uniConvertInt32toUint8_2x8); \
VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \
uniConvertInt32toUint8_2x8); \
out_ptr[0] = dst; \
} \
} \
else if(exclusive && rev == 0) \
{ \
int tmpAlpha0 = convert_int_rte(output_zp); \
int4 tmpVal; \
tmpVal.x = tmpAlpha0; \
VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\
out_ptr[0] = dst.xxxxxxxxxxxxxxxx; \
for(; coord.y < height - 1;) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src = in_ptr[0]; \
coord.y++; \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global dst_type*)output_ptr; \
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \
VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \
VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \
float tmpAlpha = convert_float(coord.y) * in_out_zp_scale + output_zp; \
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \
float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \
float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \
int4 tmpDst0 = convert_int4_rte(tmpSum0); \
int4 tmpDst1 = convert_int4_rte(tmpSum1); \
int4 tmpDst2 = convert_int4_rte(tmpSum2); \
int4 tmpDst3 = convert_int4_rte(tmpSum3); \
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
uniConvertInt32toUint8_2x8);\
VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \
uniConvertInt32toUint8_2x8);\
out_ptr[0] = dst; \
} \
} \
else if(exclusive && rev) \
{ \
coord.y = height - 1; \
int tmpAlpha0 = convert_int_rte(output_zp); \
int4 tmpVal; \
tmpVal.x = tmpAlpha0; \
VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global dst_type*)output_ptr; \
out_ptr[0] = dst.xxxxxxxxxxxxxxxx; \
for(; coord.y > 0;) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src = in_ptr[0]; \
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \
VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \
VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \
float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; \
coord.y--; \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global dst_type*)output_ptr; \
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \
float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \
float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \
int4 tmpDst0 = convert_int4_rte(tmpSum0); \
int4 tmpDst1 = convert_int4_rte(tmpSum1); \
int4 tmpDst2 = convert_int4_rte(tmpSum2); \
int4 tmpDst3 = convert_int4_rte(tmpSum3); \
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
uniConvertInt32toUint8_2x8);\
VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \
uniConvertInt32toUint8_2x8);\
out_ptr[0] = dst; \
} \
} \
}
CUMSUM_8BITS_EX_REV_ARRAY_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16)
CUMSUM_8BITS_EX_REV_ARRAY_AXIS1(I8, I8, vxc_char16, vxc_char16)
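// I16 variant of the axis-1 exclusive/reverse cumsum: 8 shorts per work-item,
// accumulated in two int4 partial sums before requantization.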
__kernel void cumsum_ex_rev_array_I16toI16_axis1(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis, int exclusive, int rev)
{
int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0);
vxc_short8 src, dst;
int4 sum0 = (int4)(0), sum1 = (int4)(0);
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
Tensor img1 = create_tensor_from_image2d_array(input, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;
__global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;
if(exclusive == 0 && rev)
{
for(coord.y = height - 1; coord.y >= 0; coord.y--)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
output_ptr = get_tensor_ptr_from_coord(img2, coord);
in_ptr = (__global vxc_short8*)input_ptr;
out_ptr = (__global vxc_short8*)output_ptr;
src = in_ptr[0];
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);
float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp;
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;
int4 tmpDst0 = convert_int4_rte(tmpSum0);
int4 tmpDst1 = convert_int4_rte(tmpSum1);
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),
uniConvertInt32toUint8_2x8);
out_ptr[0] = dst;
}
}
else if(exclusive && rev == 0)
{
int tmpAlpha0 = convert_int_rte(output_zp);
int4 tmpVal;
tmpVal.x = tmpAlpha0;
VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);
out_ptr[0] = dst.xxxxxxxx;
for(; coord.y < height - 1;)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global vxc_short8*)input_ptr;
src = in_ptr[0];
coord.y++;
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global vxc_short8*)output_ptr;
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);
float tmpAlpha = convert_float(coord.y) * in_out_zp_scale + output_zp;
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;
int4 tmpDst0 = convert_int4_rte(tmpSum0);
int4 tmpDst1 = convert_int4_rte(tmpSum1);
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),
uniConvertInt32toUint8_2x8);
out_ptr[0] = dst;
}
}
else if(exclusive && rev)
{
coord.y = height - 1;
int tmpAlpha0 = convert_int_rte(output_zp);
int4 tmpVal;
tmpVal.x = tmpAlpha0;
VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global vxc_short8*)output_ptr;
out_ptr[0] = dst.xxxxxxxx;
for(; coord.y > 0;)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global vxc_short8*)input_ptr;
src = in_ptr[0];
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);
float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp;
coord.y--;
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global vxc_short8*)output_ptr;
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;
int4 tmpDst0 = convert_int4_rte(tmpSum0);
int4 tmpDst1 = convert_int4_rte(tmpSum1);
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),
uniConvertInt32toUint8_2x8);
out_ptr[0] = dst;
}
}
}

View File

@ -0,0 +1,322 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;
_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;
_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4;
_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4;
_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4;
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
_viv_uniform VXC_512Bits uniSetZeroF16_2x8;
_viv_uniform int channel;
_viv_uniform float in_out_scale;
_viv_uniform float in_out_zp_scale;
_viv_uniform float output_zp;
_viv_uniform int remainder;
_viv_uniform int w_size;
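// Cumulative sum of F16 data along axis 2 (channel); only the reverse, exclusive,
// and exclusive+reverse modes are handled here.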
__kernel void cumsum_ex_rev_array_F16toF16_axis2(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis, int exclusive, int rev)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);
vxc_short8 src, dst;
vxc_half8 data, sum;
VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
Tensor img1 = create_tensor_from_image2d_array(input, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;
__global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;
if(rev && exclusive == 0)
{
for(coord.z = channel - 1; coord.z >= 0; coord.z--)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
output_ptr = get_tensor_ptr_from_coord(img2, coord);
in_ptr = (__global vxc_short8*)input_ptr;
out_ptr = (__global vxc_short8*)output_ptr;
src = in_ptr[0];
_viv_asm(COPY, data, src, 16);
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);
_viv_asm(COPY, dst, sum, 16);
out_ptr[0] = dst;
}
}
else if(rev == 0 && exclusive)
{
_viv_asm(COPY, dst, sum, 16);
out_ptr[0] = dst;
for(; coord.z < channel - 1;)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global vxc_short8*)input_ptr;
src = in_ptr[0];
coord.z++;
_viv_asm(COPY, data, src, 16);
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global vxc_short8*)output_ptr;
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);
_viv_asm(COPY, dst, sum, 16);
out_ptr[0] = dst;
}
}
else if(rev && exclusive)
{
_viv_asm(COPY, dst, sum, 16);
coord.z = channel - 1;
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global vxc_short8*)output_ptr;
out_ptr[0] = dst;
for(; coord.z > 0;)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global vxc_short8*)input_ptr;
src = in_ptr[0];
coord.z--;
_viv_asm(COPY, data, src, 16);
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global vxc_short8*)output_ptr;
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);
_viv_asm(COPY, dst, sum, 16);
out_ptr[0] = dst;
}
}
}
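// 8-bit (U8/I8) exclusive/reverse cumsum along axis 2: 16 elements per work-item,
// accumulated in four int4 registers and requantized on store.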
#define CUMSUM_8BITS_EX_REV_ARRAY_AXIS2(in_name, out_name, src_type, dst_type) \
__kernel void cumsum_ex_rev_array_##in_name##to##out_name##_axis2( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, int exclusive, int rev) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \
\
src_type src; \
dst_type dst; \
int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \
\
if (coord.x == ((w_size >> 4) * 16) && remainder != 0) \
{ \
coord.x = coord.x - (16 - remainder); \
} \
Tensor img1 = create_tensor_from_image2d_array(input, 1); \
Tensor img2 = create_tensor_from_image2d_array(output, 1); \
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
__global src_type* in_ptr = (__global src_type*)input_ptr; \
__global dst_type* out_ptr = (__global dst_type*)output_ptr; \
if(rev && exclusive == 0) \
{ \
for(coord.z = channel - 1; coord.z >= 0; coord.z--) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
in_ptr = (__global src_type*)input_ptr; \
out_ptr = (__global dst_type*)output_ptr; \
src = in_ptr[0]; \
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \
VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \
VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \
float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; \
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \
float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \
float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \
int4 tmpDst0 = convert_int4_rte(tmpSum0); \
int4 tmpDst1 = convert_int4_rte(tmpSum1); \
int4 tmpDst2 = convert_int4_rte(tmpSum2); \
int4 tmpDst3 = convert_int4_rte(tmpSum3); \
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
uniConvertInt32toUint8_2x8);\
VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \
uniConvertInt32toUint8_2x8);\
out_ptr[0] = dst; \
} \
} \
else if(exclusive && rev == 0) \
{ \
int tmpAlpha0 = convert_int_rte(output_zp); \
int4 tmpVal; \
tmpVal.x = tmpAlpha0; \
VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \
out_ptr[0] = dst.xxxxxxxxxxxxxxxx; \
for(; coord.z < channel - 1;) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src = in_ptr[0]; \
coord.z++; \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global dst_type*)output_ptr; \
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \
VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \
VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \
float tmpAlpha = convert_float(coord.z) * in_out_zp_scale + output_zp; \
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \
float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \
float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \
int4 tmpDst0 = convert_int4_rte(tmpSum0); \
int4 tmpDst1 = convert_int4_rte(tmpSum1); \
int4 tmpDst2 = convert_int4_rte(tmpSum2); \
int4 tmpDst3 = convert_int4_rte(tmpSum3); \
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
uniConvertInt32toUint8_2x8); \
VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \
uniConvertInt32toUint8_2x8); \
out_ptr[0] = dst; \
} \
} \
else if(rev && exclusive) \
{ \
coord.z = channel - 1; \
int tmpAlpha0 = convert_int_rte(output_zp); \
int4 tmpVal; \
tmpVal.x = tmpAlpha0; \
VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global dst_type*)output_ptr; \
out_ptr[0] = dst.xxxxxxxxxxxxxxxx; \
for(; coord.z > 0;) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src = in_ptr[0]; \
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \
VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \
VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \
float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; \
coord.z--; \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global dst_type*)output_ptr; \
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \
float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \
float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \
int4 tmpDst0 = convert_int4_rte(tmpSum0); \
int4 tmpDst1 = convert_int4_rte(tmpSum1); \
int4 tmpDst2 = convert_int4_rte(tmpSum2); \
int4 tmpDst3 = convert_int4_rte(tmpSum3); \
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
uniConvertInt32toUint8_2x8); \
VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), \
uniConvertInt32toUint8_2x8); \
out_ptr[0] = dst; \
} \
} \
}
CUMSUM_8BITS_EX_REV_ARRAY_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16)
CUMSUM_8BITS_EX_REV_ARRAY_AXIS2(I8, I8, vxc_char16, vxc_char16)
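// I16 variant of the axis-2 exclusive/reverse cumsum (8 shorts per work-item).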
__kernel void cumsum_ex_rev_array_I16toI16_axis2(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis, int exclusive, int rev)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);
vxc_short8 src, dst;
int4 sum0 = (int4)(0), sum1 = (int4)(0);
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
Tensor img1 = create_tensor_from_image2d_array(input, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;
__global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;
if(exclusive == 0 && rev)
{
for(coord.z = channel - 1; coord.z >= 0; coord.z--)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
output_ptr = get_tensor_ptr_from_coord(img2, coord);
in_ptr = (__global vxc_short8*)input_ptr;
out_ptr = (__global vxc_short8*)output_ptr;
src = in_ptr[0];
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);
float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp;
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;
int4 tmpDst0 = convert_int4_rte(tmpSum0);
int4 tmpDst1 = convert_int4_rte(tmpSum1);
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1),
uniConvertInt32toUint8_2x8);
out_ptr[0] = dst;
}
}
else if(exclusive && rev == 0)
{
int tmpAlpha0 = convert_int_rte(output_zp);
int4 tmpVal;
tmpVal.x = tmpAlpha0;
VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);
out_ptr[0] = dst.xxxxxxxx;
for(; coord.z < channel - 1;)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global vxc_short8*)input_ptr;
src = in_ptr[0];
coord.z++;
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global vxc_short8*)output_ptr;
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);
float tmpAlpha = convert_float(coord.z) * in_out_zp_scale + output_zp;
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;
int4 tmpDst0 = convert_int4_rte(tmpSum0);
int4 tmpDst1 = convert_int4_rte(tmpSum1);
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1),
uniConvertInt32toUint8_2x8);
out_ptr[0] = dst;
}
}
else if(exclusive && rev)
{
coord.z = channel - 1;
int tmpAlpha0 = convert_int_rte(output_zp);
int4 tmpVal;
tmpVal.x = tmpAlpha0;
VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global vxc_short8*)output_ptr;
out_ptr[0] = dst.xxxxxxxx;
for(; coord.z > 0;)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global vxc_short8*)input_ptr;
src = in_ptr[0];
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);
float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp;
coord.z--;
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global vxc_short8*)output_ptr;
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;
int4 tmpDst0 = convert_int4_rte(tmpSum0);
int4 tmpDst1 = convert_int4_rte(tmpSum1);
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1),
uniConvertInt32toUint8_2x8);
out_ptr[0] = dst;
}
}
}

View File

@ -0,0 +1,324 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;
_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;
_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;
_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;
_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;
_viv_uniform VXC_512Bits uniSetZeroF16_2x8;
_viv_uniform int width;
_viv_uniform int height;
_viv_uniform int channel;
_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp
_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;
_viv_uniform int remainder;
_viv_uniform int w_size;
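// The macros below implement the forward (non-exclusive) cumsum of F16 input along
// axis 2, 1, and 0 with requantized I8/I16/U8 output; multAndoutZP0 packs the
// multiplier and output zero point used by the post-shift conversion.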
#define CUMSUM_ARRAY_F16TOQINT_AXIS2(out_name, src_type, dst_type, stride_out) \
__kernel void cumsum_array_F16to##out_name##_axis2( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, int exclusive, int rev \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
\
vxc_short8 src; \
dst_type dst; \
vxc_half8 data, sum; \
VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \
vxc_ushort8 ms0; \
_viv_asm(COPY, ms0, multAndoutZP0, 16); \
Tensor img1 = create_tensor_from_image2d_array(input, 2); \
Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \
if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \
{ \
coord.x = coord.x - (8 - remainder); \
} \
for(coord.z = 0; coord.z < channel; coord.z++) \
{ \
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \
__global dst_type* out_ptr = (__global dst_type*)output_ptr; \
src = in_ptr[0]; \
_viv_asm(COPY, data, src, 16); \
\
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \
VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift_0_Lo_2x8); \
out_ptr[0] = dst; \
} \
}
CUMSUM_ARRAY_F16TOQINT_AXIS2(I8, vxc_half8, vxc_char16, 1)
CUMSUM_ARRAY_F16TOQINT_AXIS2(I16, vxc_half8, vxc_short8, 2)
CUMSUM_ARRAY_F16TOQINT_AXIS2(U8, vxc_half8, vxc_uchar16, 1)
#define CUMSUM_ARRAY_F16TOQINT_AXIS1(out_name, src_type, dst_type, stride_out) \
__kernel void cumsum_array_F16to##out_name##_axis1( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, int exclusive, int rev \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
\
vxc_short8 src; \
dst_type dst; \
vxc_half8 data, sum; \
VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \
vxc_ushort8 ms0; \
_viv_asm(COPY, ms0, multAndoutZP0, 16); \
Tensor img1 = create_tensor_from_image2d_array(input, 2); \
Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \
if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \
{ \
coord.x = coord.x - (8 - remainder); \
} \
for(coord.y = 0; coord.y < height; coord.y++) \
{ \
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \
__global dst_type* out_ptr = (__global dst_type*)output_ptr; \
src = in_ptr[0]; \
_viv_asm(COPY, data, src, 16); \
\
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \
VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift_0_Lo_2x8); \
out_ptr[0] = dst; \
} \
}
CUMSUM_ARRAY_F16TOQINT_AXIS1(I8, vxc_half8, vxc_char16, 1)
CUMSUM_ARRAY_F16TOQINT_AXIS1(I16, vxc_half8, vxc_short8, 2)
CUMSUM_ARRAY_F16TOQINT_AXIS1(U8, vxc_half8, vxc_uchar16, 1)
#define CUMSUM_ARRAY_F16TOQINT_AXIS0(out_name, src_type, dst_type, stride_out) \
__kernel void cumsum_array_F16to##out_name##_axis0( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, int exclusive, int rev \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
\
vxc_short8 src; \
dst_type dst; \
vxc_half8 data, tmpsum, sum; \
VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \
vxc_ushort8 ms0; \
_viv_asm(COPY, ms0, multAndoutZP0, 16); \
Tensor img1 = create_tensor_from_image2d_array(input, 2); \
Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \
for(; coord.x < width; coord.x += 8) \
{ \
if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \
{ \
coord.x = coord.x - (8 - remainder); \
} \
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \
__global dst_type* out_ptr = (__global dst_type*)output_ptr; \
src = in_ptr[0]; \
_viv_asm(COPY, data, src, 16); \
\
VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4); \
VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4); \
VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8); \
VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8); \
VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift_0_Lo_2x8); \
out_ptr[0] = dst; \
} \
}
CUMSUM_ARRAY_F16TOQINT_AXIS0(I8, vxc_half8, vxc_char16, 1)
CUMSUM_ARRAY_F16TOQINT_AXIS0(I16, vxc_half8, vxc_short8, 2)
CUMSUM_ARRAY_F16TOQINT_AXIS0(U8, vxc_half8, vxc_uchar16, 1)
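// Exclusive/reverse variants of the F16-to-quantized cumsum along axis 2 and axis 1.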
#define CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS2(out_name, src_type, dst_type, stride_out) \
__kernel void cumsum_array_ex_rev_F16to##out_name##_axis2( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, int exclusive, int rev \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \
\
vxc_short8 src; \
dst_type dst; \
vxc_half8 data, sum; \
VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \
vxc_ushort8 ms0; \
_viv_asm(COPY, ms0, multAndoutZP0, 16); \
if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \
{ \
coord.x = coord.x - (8 - remainder); \
} \
Tensor img1 = create_tensor_from_image2d_array(input, 2); \
Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \
__global dst_type* out_ptr = (__global dst_type*)output_ptr; \
if(exclusive == 0 && rev) \
{ \
for(coord.z = channel - 1; coord.z >= 0; coord.z--) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
in_ptr = (__global vxc_short8*)input_ptr; \
out_ptr = (__global dst_type*)output_ptr; \
src = in_ptr[0]; \
_viv_asm(COPY, data, src, 16); \
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \
VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift_0_Lo_2x8); \
out_ptr[0] = dst; \
} \
} \
else if(exclusive && rev == 0) \
{ \
VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift_0_Lo_2x8); \
out_ptr[0] = dst; \
for(; coord.z < channel - 1;) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global vxc_short8*)input_ptr; \
src = in_ptr[0]; \
coord.z++; \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global dst_type*)output_ptr; \
_viv_asm(COPY, data, src, 16); \
\
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \
VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift_0_Lo_2x8); \
out_ptr[0] = dst; \
} \
} \
else if(exclusive && rev) \
{ \
VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift_0_Lo_2x8); \
coord.z = channel - 1; \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global dst_type*)output_ptr; \
out_ptr[0] = dst; \
for(; coord.z > 0;) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global vxc_short8*)input_ptr; \
src = in_ptr[0]; \
coord.z--; \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global dst_type*)output_ptr; \
_viv_asm(COPY, data, src, 16); \
\
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \
VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift_0_Lo_2x8); \
out_ptr[0] = dst; \
} \
} \
}
CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS2(I8, vxc_half8, vxc_char16, 1)
CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS2(I16, vxc_half8, vxc_short8, 2)
CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS2(U8, vxc_half8, vxc_uchar16, 1)
#define CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS1(out_name, src_type, dst_type, stride_out) \
__kernel void cumsum_array_ex_rev_F16to##out_name##_axis1( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, int exclusive, int rev \
) \
{ \
int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); \
\
vxc_short8 src; \
dst_type dst; \
vxc_half8 data, sum; \
VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \
vxc_ushort8 ms0; \
_viv_asm(COPY, ms0, multAndoutZP0, 16); \
if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \
{ \
coord.x = coord.x - (8 - remainder); \
} \
Tensor img1 = create_tensor_from_image2d_array(input, 2); \
Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \
__global dst_type* out_ptr = (__global dst_type*)output_ptr; \
if(exclusive == 0 && rev) \
{ \
for(coord.y = height - 1; coord.y >= 0; coord.y--) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
in_ptr = (__global vxc_short8*)input_ptr; \
out_ptr = (__global dst_type*)output_ptr; \
src = in_ptr[0]; \
_viv_asm(COPY, data, src, 16); \
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \
VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift_0_Lo_2x8); \
out_ptr[0] = dst; \
} \
} \
else if(exclusive && rev == 0) \
{ \
VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift_0_Lo_2x8); \
out_ptr[0] = dst; \
for(; coord.y < height - 1;) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global vxc_short8*)input_ptr; \
src = in_ptr[0]; \
coord.y++; \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global dst_type*)output_ptr; \
_viv_asm(COPY, data, src, 16); \
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \
VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift_0_Lo_2x8); \
out_ptr[0] = dst; \
} \
} \
else if(exclusive && rev) \
{ \
VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift_0_Lo_2x8); \
coord.y = height - 1; \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global dst_type*)output_ptr; \
out_ptr[0] = dst; \
for(; coord.y > 0;) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global vxc_short8*)input_ptr; \
src = in_ptr[0]; \
coord.y--; \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global dst_type*)output_ptr; \
_viv_asm(COPY, data, src, 16); \
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \
VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift_0_Lo_2x8); \
out_ptr[0] = dst; \
} \
} \
}
CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS1(I8, vxc_half8, vxc_char16, 1)
CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS1(I16, vxc_half8, vxc_short8, 2)
CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS1(U8, vxc_half8, vxc_uchar16, 1)

View File

@ -0,0 +1,108 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;
_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;
_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;
_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;
_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;
_viv_uniform VXC_512Bits uniSetZeroF16_2x8;
_viv_uniform int width;
_viv_uniform int height;
_viv_uniform int channel;
_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp
_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;
_viv_uniform int remainder;
_viv_uniform int w_size;
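// 2D (single-slice) forward cumsum of F16 input along axis 1 and axis 0 with
// requantized I8/I16/U8 output.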
#define CUMSUM_ARRAY_F16TOQINT_AXIS1_2D(out_name, src_type, dst_type, stride_out) \
__kernel void cumsum_array_F16to##out_name##_axis1_2D( \
__read_only image2d_t input, \
__write_only image2d_t output, \
int axis, int exclusive, int rev \
) \
{ \
int2 coord = (int2)(get_global_id(0), 0); \
\
vxc_short8 src; \
dst_type dst; \
vxc_half8 data, sum; \
VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \
vxc_ushort8 ms0; \
_viv_asm(COPY, ms0, multAndoutZP0, 16); \
Tensor img1 = create_tensor_from_image2d_array(input, 2); \
Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \
if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \
{ \
coord.x = coord.x - (8 - remainder); \
} \
for(; coord.y < height; coord.y++) \
{ \
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \
__global dst_type* out_ptr = (__global dst_type*)output_ptr; \
src = in_ptr[0]; \
_viv_asm(COPY, data, src, 16); \
\
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
uniAccSumVertF16toF16_2x8); \
VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift_0_Lo_2x8); \
out_ptr[0] = dst; \
} \
}
CUMSUM_ARRAY_F16TOQINT_AXIS1_2D(I8, vxc_half8, vxc_char16, 1)
CUMSUM_ARRAY_F16TOQINT_AXIS1_2D(I16, vxc_half8, vxc_short8, 2)
CUMSUM_ARRAY_F16TOQINT_AXIS1_2D(U8, vxc_half8, vxc_uchar16, 1)
#define CUMSUM_ARRAY_F16TOQINT_AXIS0_2D(out_name, src_type, dst_type, stride_out) \
__kernel void cumsum_array_F16to##out_name##_axis0_2D( \
__read_only image2d_t input, \
__write_only image2d_t output, \
int axis, int exclusive, int rev \
) \
{ \
int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
\
vxc_short8 src; \
dst_type dst; \
vxc_half8 data, tmpsum, sum; \
VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \
vxc_ushort8 ms0; \
_viv_asm(COPY, ms0, multAndoutZP0, 16); \
Tensor img1 = create_tensor_from_image2d_array(input, 2); \
Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \
for(; coord.x < width; coord.x += 8) \
{ \
if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \
{ \
coord.x = coord.x - (8 - remainder); \
} \
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \
__global dst_type* out_ptr = (__global dst_type*)output_ptr; \
src = in_ptr[0]; \
_viv_asm(COPY, data, src, 16); \
\
VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniSumHorzF16toF16A_4x4); \
VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\
uniSumHorzF16toF16B_4x4); \
VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
uniSumHorzF16toF16C_2x8); \
VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
uniAccSumHorzF16toF16_2x8); \
VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift_0_Lo_2x8); \
out_ptr[0] = dst; \
} \
}
CUMSUM_ARRAY_F16TOQINT_AXIS0_2D(I8, vxc_half8, vxc_char16, 1)
CUMSUM_ARRAY_F16TOQINT_AXIS0_2D(I16, vxc_half8, vxc_short8, 2)
CUMSUM_ARRAY_F16TOQINT_AXIS0_2D(U8, vxc_half8, vxc_uchar16, 1)

View File

@ -92,3 +92,116 @@ __kernel void gather_nd_F16toF16_1D(
VXC_ReadImage(src, input0, coord.zw, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
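// Pointer-based gather_nd for 1-D indices: each work-item reads one index from input1
// and copies the selected element of input0 into its output position.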
__kernel void gather_nd_array_I8toI8_1D(
__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int4 coord = (int4)(0, gidy, gidx, 0);
Image img = create_image_from_image2d(input1, 4);
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);
int4 indice = ((int4 *)indice_ptr)[0];
coord.w = indice.x;
Image img1 = create_image_from_image2d(input0, 1);
Image img2 = create_image_from_image2d(output, 1);
uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);
__global char* data_ptr = (__global char*)input_ptr;
__global char* dst_ptr = (__global char*)output_ptr;
char src = data_ptr[0];
dst_ptr[0] = src;
}
__kernel void gather_nd_array_U8toU8_1D(
__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int4 coord = (int4)(0, gidy, gidx, 0);
Image img = create_image_from_image2d(input1, 4);
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);
int4 indice = ((int4 *)indice_ptr)[0];
coord.w = indice.x;
Image img1 = create_image_from_image2d(input0, 1);
Image img2 = create_image_from_image2d(output, 1);
uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);
__global uchar* data_ptr = (__global uchar*)input_ptr;
__global uchar* dst_ptr = (__global uchar*)output_ptr;
uchar src = data_ptr[0];
dst_ptr[0] = src;
}
__kernel void gather_nd_array_I16toI16_1D(
__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int4 coord = (int4)(0, gidy, gidx, 0);
Image img = create_image_from_image2d(input1, 4);
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);
int4 indice = ((int4 *)indice_ptr)[0];
coord.w = indice.x;
Image img1 = create_image_from_image2d(input0, 2);
Image img2 = create_image_from_image2d(output, 2);
uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);
__global short* data_ptr = (__global short*)input_ptr;
__global short* dst_ptr = (__global short*)output_ptr;
short src = data_ptr[0];
dst_ptr[0] = src;
}
__kernel void gather_nd_array_F16toF16_1D(
__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int4 coord = (int4)(0, gidy, gidx, 0);
Image img = create_image_from_image2d(input1, 4);
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);
int4 indice = ((int4 *)indice_ptr)[0];
coord.w = indice.x;
Image img1 = create_image_from_image2d(input0, 2);
Image img2 = create_image_from_image2d(output, 2);
uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);
__global short* data_ptr = (__global short*)input_ptr;
__global short* dst_ptr = (__global short*)output_ptr;
short src = data_ptr[0];
dst_ptr[0] = src;
}

View File

@ -92,3 +92,116 @@ __kernel void gather_nd_F16toF16_2D(
VXC_ReadImage(src, input0, indice.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
__kernel void gather_nd_array_I8toI8_2D(
__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int4 coord = (int4)(0, gidy, gidx, 0);
Image img = create_image_from_image2d(input1, 4);
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);
int4 indice = ((int4 *)indice_ptr)[0];
indice.x = indice.x * block_size + gidx;
Image img1 = create_image_from_image2d(input0, 1);
Image img2 = create_image_from_image2d(output, 1);
uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);
__global char* data_ptr = (__global char*)input_ptr;
__global char* dst_ptr = (__global char*)output_ptr;
char src = data_ptr[0];
dst_ptr[0] = src;
}
__kernel void gather_nd_array_U8toU8_2D(
__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int4 coord = (int4)(0, gidy, gidx, 0);
Image img = create_image_from_image2d(input1, 4);
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);
int4 indice = ((int4 *)indice_ptr)[0];
indice.x = indice.x * block_size + gidx;
Image img1 = create_image_from_image2d(input0, 1);
Image img2 = create_image_from_image2d(output, 1);
uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);
__global uchar* data_ptr = (__global uchar*)input_ptr;
__global uchar* dst_ptr = (__global uchar*)output_ptr;
uchar src = data_ptr[0];
dst_ptr[0] = src;
}
__kernel void gather_nd_array_I16toI16_2D(
__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int4 coord = (int4)(0, gidy, gidx, 0);
Image img = create_image_from_image2d(input1, 4);
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);
int4 indice = ((int4 *)indice_ptr)[0];
indice.x = indice.x * block_size + gidx;
Image img1 = create_image_from_image2d(input0, 2);
Image img2 = create_image_from_image2d(output, 2);
uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);
__global short* data_ptr = (__global short*)input_ptr;
__global short* dst_ptr = (__global short*)output_ptr;
short src = data_ptr[0];
dst_ptr[0] = src;
}
__kernel void gather_nd_array_F16toF16_2D(
__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int4 coord = (int4)(0, gidy, gidx, 0);
Image img = create_image_from_image2d(input1, 4);
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);
int4 indice = ((int4 *)indice_ptr)[0];
indice.x = indice.x * block_size + gidx;
Image img1 = create_image_from_image2d(input0, 2);
Image img2 = create_image_from_image2d(output, 2);
uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);
__global short* data_ptr = (__global short*)input_ptr;
__global short* dst_ptr = (__global short*)output_ptr;
short src = data_ptr[0];
dst_ptr[0] = src;
}

View File

@ -80,3 +80,85 @@ __kernel void gather_nd_F16to##src1_type_name##_2D( \
GATHER_ND_F16_TO_QINT_2D(U8, vxc_uchar16)
GATHER_ND_F16_TO_QINT_2D(I8, vxc_char16)
GATHER_ND_F16_TO_QINT_2D(I16, vxc_short8)
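// Pointer-based gather_nd with type conversion (2-D indices): the first macro dequantizes
// the gathered values to F16 via multAndoutZP0, the second requantizes F16 input via
// multAndoutZP1.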
#define GATHER_ND_ARRAY_QINT_TO_F16_2D(src0_type_name, read_type, ptr_type, stride) \
__kernel void gather_nd_array_##src0_type_name##toF16_2D( \
__read_only image2d_t input0, \
__read_only image2d_t input1, \
__write_only image2d_t output, \
int block_size, \
int coord_dim \
) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
\
int4 coord = (int4)(0, gidy, gidx, 0); \
Image img = create_image_from_image2d(input1, 4); \
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \
int4 indice = ((int4 *)indice_ptr)[0]; \
\
indice.x = indice.x * block_size + gidx; \
\
Image img1 = create_image_from_image2d(input0, stride); \
Image img2 = create_image_from_image2d(output, 2); \
\
uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy); \
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \
\
__global ptr_type data_ptr = (__global ptr_type)input_ptr; \
__global vxc_short8* dst_ptr = (__global vxc_short8* )output_ptr; \
read_type src = data_ptr[0]; \
\
vxc_half8 src0; \
vxc_short8 dst0; \
vxc_ushort8 ms0; \
_viv_asm(COPY, ms0, multAndoutZP0, 16); \
VXC_DP2x8(src0,src,ms0,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,1),uniU8MulAndPostShift_0_Lo_2x8); \
_viv_asm(COPY, dst0, src0, 16); \
dst_ptr[0] = dst0; \
}
GATHER_ND_ARRAY_QINT_TO_F16_2D(U8, vxc_uchar16, vxc_uchar16*, 1)
GATHER_ND_ARRAY_QINT_TO_F16_2D(I8, vxc_char16, vxc_char16*, 1)
GATHER_ND_ARRAY_QINT_TO_F16_2D(I16, vxc_short8, vxc_short8*, 2)
#define GATHER_ND_ARRAY_F16_TO_QINT_2D(src1_type_name, write_type, ptr_type, stride) \
__kernel void gather_nd_array_F16to##src1_type_name##_2D( \
__read_only image2d_t input0, \
__read_only image2d_t input1, \
__write_only image2d_t output, \
int block_size, \
int coord_dim \
) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
\
int4 coord = (int4)(0, gidy, gidx, 0); \
Image img = create_image_from_image2d(input1, 4); \
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \
int4 indice = ((int4 *)indice_ptr)[0]; \
\
indice.x = indice.x * block_size + gidx; \
\
Image img1 = create_image_from_image2d(input0, 2); \
Image img2 = create_image_from_image2d(output, stride); \
\
uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy); \
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \
\
__global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr; \
__global ptr_type dst_ptr = (__global ptr_type )output_ptr; \
vxc_short8 src = data_ptr[0]; \
\
vxc_ushort8 mp1; \
_viv_asm(COPY, mp1, multAndoutZP1, 16); \
vxc_half8 data; \
write_type dst; \
_viv_asm(COPY, data, src, 16); \
VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven,1),uniConvertFp16toU8_2x8); \
dst_ptr[0] = dst; \
}
GATHER_ND_ARRAY_F16_TO_QINT_2D(U8, vxc_uchar16, vxc_uchar16*, 1)
GATHER_ND_ARRAY_F16_TO_QINT_2D(I8, vxc_char16, vxc_char16*, 1)
GATHER_ND_ARRAY_F16_TO_QINT_2D(I16, vxc_short8, vxc_short8*, 2)

View File

@ -98,3 +98,120 @@ __kernel void gather_nd_F16toF16_3D(
VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
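// Pointer-based gather_nd for 3-D input: the gathered coordinate comes from input1
// (x scaled by block_size plus the block offset) and one element of input0 is copied
// per work-item.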
__kernel void gather_nd_array_I8toI8_3D(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int4 coord = (int4)(0, gidy, gidx, 0);
Image img = create_image_from_image2d(input1, 4);
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);
int4 indice = ((int4 *)indice_ptr)[0];
indice.x = indice.x * block_size + gidx;
indice.w = 0;
Tensor img1 = create_tensor_from_image2d_array(input0, 1);
Image img2 = create_image_from_image2d(output, 1);
uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);
__global char* data_ptr = (__global char*)input_ptr;
__global char* dst_ptr = (__global char*)output_ptr;
char src = data_ptr[0];
dst_ptr[0] = src;
}
__kernel void gather_nd_array_U8toU8_3D(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int4 coord = (int4)(0, gidy, gidx, 0);
Image img = create_image_from_image2d(input1, 4);
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);
int4 indice = ((int4 *)indice_ptr)[0];
indice.x = indice.x * block_size + gidx;
indice.w = 0;
Tensor img1 = create_tensor_from_image2d_array(input0, 1);
Image img2 = create_image_from_image2d(output, 1);
uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);
__global uchar* data_ptr = (__global uchar*)input_ptr;
__global uchar* dst_ptr = (__global uchar*)output_ptr;
uchar src = data_ptr[0];
dst_ptr[0] = src;
}
__kernel void gather_nd_array_I16toI16_3D(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int4 coord = (int4)(0, gidy, gidx, 0);
Image img = create_image_from_image2d(input1, 4);
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);
int4 indice = ((int4 *)indice_ptr)[0];
indice.x = indice.x * block_size + gidx;
indice.w = 0;
Tensor img1 = create_tensor_from_image2d_array(input0, 2);
Image img2 = create_image_from_image2d(output, 2);
uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);
__global short* data_ptr = (__global short*)input_ptr;
__global short* dst_ptr = (__global short*)output_ptr;
short src = data_ptr[0];
dst_ptr[0] = src;
}
__kernel void gather_nd_array_F16toF16_3D(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int4 coord = (int4)(0, gidy, gidx, 0);
Image img = create_image_from_image2d(input1, 4);
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);
int4 indice = ((int4 *)indice_ptr)[0];
indice.x = indice.x * block_size + gidx;
indice.w = 0;
Tensor img1 = create_tensor_from_image2d_array(input0, 2);
Image img2 = create_image_from_image2d(output, 2);
uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);
__global short* data_ptr = (__global short*)input_ptr;
__global short* dst_ptr = (__global short*)output_ptr;
short src = data_ptr[0];
dst_ptr[0] = src;
}

View File

@ -80,3 +80,86 @@ GATHER_ND_F16_TO_QINT_3D(U8, vxc_uchar16)
GATHER_ND_F16_TO_QINT_3D(I8, vxc_char16)
GATHER_ND_F16_TO_QINT_3D(I16, vxc_short8)
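// Pointer-based gather_nd with type conversion for 3-D input: dequantizing (QINT to F16)
// and requantizing (F16 to QINT) variants follow.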
#define GATHER_ND_ARRAY_QINT_TO_F16_3D(src0_type_name, read_type, ptr_type, stride) \
__kernel void gather_nd_array_##src0_type_name##toF16_3D( \
__read_only image2d_array_t input0, \
__read_only image2d_t input1, \
__write_only image2d_t output, \
int block_size, \
int coord_dim \
) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
\
int4 coord = (int4)(0, gidy, gidx, 0); \
Image img = create_image_from_image2d(input1, 4); \
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \
int4 indice = ((int4 *)indice_ptr)[0]; \
\
indice.x = indice.x * block_size + gidx; \
indice.w = 0; \
Tensor img1 = create_tensor_from_image2d_array(input0, stride); \
Image img2 = create_image_from_image2d(output, 2); \
\
uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice); \
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \
\
__global ptr_type data_ptr = (__global ptr_type)input_ptr; \
__global vxc_short8* dst_ptr = (__global vxc_short8* )output_ptr; \
read_type src = data_ptr[0]; \
\
vxc_half8 src0; \
vxc_short8 dst0; \
vxc_ushort8 ms0; \
_viv_asm(COPY, ms0, multAndoutZP0, 16); \
VXC_DP2x8(src0,src,ms0,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,1),uniU8MulAndPostShift_0_Lo_2x8); \
_viv_asm(COPY, dst0, src0, 16); \
dst_ptr[0] = dst0; \
}
GATHER_ND_ARRAY_QINT_TO_F16_3D(U8, vxc_uchar16, vxc_uchar16*, 1)
GATHER_ND_ARRAY_QINT_TO_F16_3D(I8, vxc_char16, vxc_char16*, 1)
GATHER_ND_ARRAY_QINT_TO_F16_3D(I16, vxc_short8, vxc_short8*, 2)
#define GATHER_ND_ARRAY_F16_TO_QINT_3D(src1_type_name, write_type, ptr_type, stride) \
__kernel void gather_nd_array_F16to##src1_type_name##_3D( \
__read_only image2d_array_t input0, \
__read_only image2d_t input1, \
__write_only image2d_t output, \
int block_size, \
int coord_dim \
) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
\
int4 coord = (int4)(0, gidy, gidx, 0); \
Image img = create_image_from_image2d(input1, 4); \
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \
int4 indice = ((int4 *)indice_ptr)[0]; \
\
indice.x = indice.x * block_size + gidx; \
indice.w = 0; \
\
Tensor img1 = create_tensor_from_image2d_array(input0, 2); \
Image img2 = create_image_from_image2d(output, stride); \
\
uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice); \
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \
\
__global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr; \
__global ptr_type dst_ptr = (__global ptr_type )output_ptr; \
vxc_short8 src = data_ptr[0]; \
\
vxc_ushort8 mp1; \
_viv_asm(COPY, mp1, multAndoutZP1, 16); \
vxc_half8 data; \
write_type dst; \
_viv_asm(COPY, data, src, 16); \
VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven,1), uniConvertFp16toU8_2x8); \
dst_ptr[0] = dst; \
}
GATHER_ND_ARRAY_F16_TO_QINT_3D(U8, vxc_uchar16, vxc_uchar16*, 1)
GATHER_ND_ARRAY_F16_TO_QINT_3D(I8, vxc_char16, vxc_char16*, 1)
GATHER_ND_ARRAY_F16_TO_QINT_3D(I16, vxc_short8, vxc_short8*, 2)

View File

@ -95,3 +95,118 @@ __kernel void gather_nd_batch_F16toF16_1D(
VXC_ReadImage(src, input0, coord0, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
__kernel void gather_nd_array_batch_I8toI8_1D(
__read_only image2d_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output,
int block_size,
int coord_dim
)
{
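// Batched 1D variant: gidz selects the batch, the index row at (0, gidy, gidz)
// picks the source column, and gidx walks the block being copied.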
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // index num
int gidz = get_global_id(2); // batch num
int4 coord = (int4)(gidx, gidy, gidz, 0);
Tensor img = create_tensor_from_image2d_array(input1, 4);
uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);
int4 indice = ((int4 *)indice_ptr)[0];
int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);
Image img1 = create_image_from_image2d(input0, 1);
Tensor img2 = create_tensor_from_image2d_array(output, 1);
uchar* input_ptr = get_image_ptr_from_coord(img1, coord0);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global char* data_ptr = (__global char*)input_ptr;
__global char* dst_ptr = (__global char*)output_ptr;
char src = data_ptr[0];
dst_ptr[0] = src;
}
__kernel void gather_nd_array_batch_U8toU8_1D(
__read_only image2d_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // index num
int gidz = get_global_id(2); // batch num
int4 coord = (int4)(gidx, gidy, gidz, 0);
Tensor img = create_tensor_from_image2d_array(input1, 4);
uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);
int4 indice = ((int4 *)indice_ptr)[0];
int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);
Image img1 = create_image_from_image2d(input0, 1);
Tensor img2 = create_tensor_from_image2d_array(output, 1);
uchar* input_ptr = get_image_ptr_from_coord(img1, coord0);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global uchar* data_ptr = (__global uchar*)input_ptr;
__global uchar* dst_ptr = (__global uchar*)output_ptr;
uchar src = data_ptr[0];
dst_ptr[0] = src;
}
__kernel void gather_nd_array_batch_I16toI16_1D(
__read_only image2d_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // index num
int gidz = get_global_id(2); // batch num
int4 coord = (int4)(gidx, gidy, gidz, 0);
Tensor img = create_tensor_from_image2d_array(input1, 4);
uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);
int4 indice = ((int4 *)indice_ptr)[0];
int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);
Image img1 = create_image_from_image2d(input0, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
uchar* input_ptr = get_image_ptr_from_coord(img1, coord0);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global short* data_ptr = (__global short*)input_ptr;
__global short* dst_ptr = (__global short*)output_ptr;
short src = data_ptr[0];
dst_ptr[0] = src;
}
__kernel void gather_nd_array_batch_F16toF16_1D(
__read_only image2d_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // index num
int gidz = get_global_id(2); // batch num
int4 coord = (int4)(gidx, gidy, gidz, 0);
Tensor img = create_tensor_from_image2d_array(input1, 4);
uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);
int4 indice = ((int4 *)indice_ptr)[0];
int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);
Image img1 = create_image_from_image2d(input0, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
uchar* input_ptr = get_image_ptr_from_coord(img1, coord0);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global short* data_ptr = (__global short*)input_ptr;
__global short* dst_ptr = (__global short*)output_ptr;
short src = data_ptr[0];
dst_ptr[0] = src;
}

View File

@ -26,7 +26,7 @@ __kernel void gather_nd_batch_I8toI8_2D(
VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
__kernel void gather_nd_U8toU8_2D(
__kernel void gather_nd_batch_U8toU8_2D(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output,
@ -51,7 +51,7 @@ __kernel void gather_nd_U8toU8_2D(
VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
__kernel void gather_nd_I16toI16_2D(
__kernel void gather_nd_batch_I16toI16_2D(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output,
@ -76,7 +76,7 @@ __kernel void gather_nd_I16toI16_2D(
VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
__kernel void gather_nd_F16toF16_2D(
__kernel void gather_nd_batch_F16toF16_2D(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output,
@ -100,3 +100,123 @@ __kernel void gather_nd_F16toF16_2D(
VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
__kernel void gather_nd_array_batch_I8toI8_2D(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output,
int block_size,
int coord_dim
)
{
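// Batched 2D variant: x/y of the gather coordinate come from the index tensor
// (x scaled by block_size and offset by gidx) while z/w reuse the work-item's
// batch coordinate.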
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // index num
int gidz = get_global_id(2); // batch num
int4 coord = (int4)(gidx, gidy, gidz, 0);
Tensor img = create_tensor_from_image2d_array(input1, 4);
uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);
int4 indice = ((int4 *)indice_ptr)[0];
indice.x = indice.x * block_size + gidx;
indice.zw = coord.zw;
Tensor img1 = create_tensor_from_image2d_array(input0, 1);
Tensor img2 = create_tensor_from_image2d_array(output, 1);
uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global char* data_ptr = (__global char*)input_ptr;
__global char* dst_ptr = (__global char*)output_ptr;
char src = data_ptr[0];
dst_ptr[0] = src;
}
__kernel void gather_nd_array_batch_U8toU8_2D(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // index num
int gidz = get_global_id(2); // batch num
int4 coord = (int4)(gidx, gidy, gidz, 0);
Tensor img = create_tensor_from_image2d_array(input1, 4);
uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);
int4 indice = ((int4 *)indice_ptr)[0];
indice.x = indice.x * block_size + gidx;
indice.zw = coord.zw;
Tensor img1 = create_tensor_from_image2d_array(input0, 1);
Tensor img2 = create_tensor_from_image2d_array(output, 1);
uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global uchar* data_ptr = (__global uchar*)input_ptr;
__global uchar* dst_ptr = (__global uchar*)output_ptr;
uchar src = data_ptr[0];
dst_ptr[0] = src;
}
__kernel void gather_nd_array_batch_I16toI16_2D(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // index num
int gidz = get_global_id(2); // batch num
int4 coord = (int4)(gidx, gidy, gidz, 0);
Tensor img = create_tensor_from_image2d_array(input1, 4);
uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);
int4 indice = ((int4 *)indice_ptr)[0];
indice.x = indice.x * block_size + gidx;
indice.zw = coord.zw;
Tensor img1 = create_tensor_from_image2d_array(input0, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global short* data_ptr = (__global short*)input_ptr;
__global short* dst_ptr = (__global short*)output_ptr;
short src = data_ptr[0];
dst_ptr[0] = src;
}
__kernel void gather_nd_array_batch_F16toF16_2D(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // index num
int gidz = get_global_id(2); // batch num
int4 coord = (int4)(gidx, gidy, gidz, 0);
Tensor img = create_tensor_from_image2d_array(input1, 4);
uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);
int4 indice = ((int4 *)indice_ptr)[0];
indice.x = indice.x * block_size + gidx;
indice.zw = coord.zw;
Tensor img1 = create_tensor_from_image2d_array(input0, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global short* data_ptr = (__global short*)input_ptr;
__global short* dst_ptr = (__global short*)output_ptr;
short src = data_ptr[0];
dst_ptr[0] = src;
}

View File

@ -81,3 +81,85 @@ GATHER_ND_F16_TO_QINT_1D(U8, vxc_uchar16)
GATHER_ND_F16_TO_QINT_1D(I8, vxc_char16)
GATHER_ND_F16_TO_QINT_1D(I16, vxc_short8)
#define GATHER_ND_ARRAY_QINT_TO_F16_1D(src0_type_name, read_type, ptr_type, stride) \
__kernel void gather_nd_array_##src0_type_name##toF16_1D( \
__read_only image2d_t input0, \
__read_only image2d_t input1, \
__write_only image2d_t output, \
int block_size, \
int coord_dim \
) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
\
int4 coord = (int4)(0, gidy, gidx, 0); \
Image img = create_image_from_image2d(input1, 4); \
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \
int4 indice = ((int4 *)indice_ptr)[0]; \
\
coord.w = indice.x; \
\
Image img1 = create_image_from_image2d(input0, stride); \
Image img2 = create_image_from_image2d(output, 2); \
\
uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw); \
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \
\
__global ptr_type data_ptr = (__global ptr_type)input_ptr; \
__global vxc_short8* dst_ptr = (__global vxc_short8* )output_ptr; \
read_type src = data_ptr[0]; \
\
vxc_half8 src0; \
vxc_short8 dst0; \
vxc_ushort8 ms0; \
_viv_asm(COPY, ms0, multAndoutZP0, 16); \
VXC_DP2x8(src0,src,ms0,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,1),uniU8MulAndPostShift_0_Lo_2x8); \
_viv_asm(COPY, dst0, src0, 16); \
dst_ptr[0] = dst0; \
}
GATHER_ND_ARRAY_QINT_TO_F16_1D(U8, vxc_uchar16, vxc_uchar16*, 1)
GATHER_ND_ARRAY_QINT_TO_F16_1D(I8, vxc_char16, vxc_char16*, 1)
GATHER_ND_ARRAY_QINT_TO_F16_1D(I16, vxc_short8, vxc_short8*, 2)
#define GATHER_ND_ARRAY_F16_TO_QINT_1D(src1_type_name, write_type, ptr_type, stride) \
__kernel void gather_nd_array_F16to##src1_type_name##_1D( \
__read_only image2d_t input0, \
__read_only image2d_t input1, \
__write_only image2d_t output, \
int block_size, \
int coord_dim \
) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
\
int4 coord = (int4)(0, gidy, gidx, 0); \
Image img = create_image_from_image2d(input1, 4); \
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \
int4 indice = ((int4 *)indice_ptr)[0]; \
\
coord.w = indice.x; \
\
Image img1 = create_image_from_image2d(input0, 2); \
Image img2 = create_image_from_image2d(output, stride); \
\
uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw); \
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \
\
__global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr; \
__global ptr_type dst_ptr = (__global ptr_type )output_ptr; \
vxc_short8 src = data_ptr[0]; \
vxc_ushort8 mp1; \
_viv_asm(COPY, mp1, multAndoutZP1, 16); \
vxc_half8 data; \
write_type dst; \
_viv_asm(COPY, data, src, 16); \
VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven, 1),uniConvertFp16toU8_2x8); \
dst_ptr[0] = dst; \
}
GATHER_ND_ARRAY_F16_TO_QINT_1D(U8, vxc_uchar16, vxc_uchar16*, 1)
GATHER_ND_ARRAY_F16_TO_QINT_1D(I8, vxc_char16, vxc_char16*, 1)
GATHER_ND_ARRAY_F16_TO_QINT_1D(I16, vxc_short8, vxc_short8*, 2)

View File

@ -65,5 +65,5 @@ __kernel void pre_process_gray_half_U8toU8
coord_in.xy = coord_in.xy >> 1;
VXC_WriteImage(output, coord_in.xy, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output, coord_in.xy, src0.s02468ace, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

File diff suppressed because it is too large Load Diff

View File

@ -62,11 +62,20 @@ static vsi_status _argmaxmin_op_compute
}
status = VSI_FAILURE;
param =vsi_nn_kernel_param_create();
param = vsi_nn_kernel_param_create();
if (strcmp(kernel_name, "argmax") == 0)
{
vsi_nn_argmax_param * p = &(self->nn_param.argmax);
axis = p->axis;
#if (VX_ARGMAX_VX_SUPPORT)
vsi_nn_kernel_param_add_int32(param, "axis", axis);
self->n = (vx_node)vsi_nn_kernel_selector(self->graph,
kernel_name,
inputs, 1,
outputs, 1, param);
goto final;
#endif
}
else
{
@ -101,6 +110,10 @@ static vsi_status _argmaxmin_op_compute
vsi_nn_ReleaseTensor( &reshape_tensors[0] );
vsi_nn_ReleaseTensor( &reshape_tensors[1] );
}
#if (VX_ARGMAX_VX_SUPPORT)
final:
#endif
if( self->n )
{
status = VSI_SUCCESS;

View File

@ -0,0 +1,153 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <string.h>
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_log.h"
#include "vsi_nn_node.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_constraint_check.h"
#include "vsi_nn_error.h"
typedef struct _bitcast_local_data_t {
int32_t placeholder;
} bitcast_local_data_t;
/*
Declare number of input and output.
*/
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
static vsi_status op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_t n = NULL;
n = vsi_nn_kernel_selector( self->graph, "bitcast", inputs, 1, outputs, 1, NULL );
if (n != NULL)
{
status = VSI_SUCCESS;
}
self->n = (vx_node)n;
return status;
} /* op_compute() */
static vsi_bool op_setup
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
int32_t i = 0;
VSI_UNREFERENCED(self);
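/* Output shape follows the element-size ratio: equal element sizes keep the
 * input shape, a wider input type prepends a dimension holding
 * input_byte / output_byte elements, and a narrower input type drops one
 * dimension (a rank-0 result is marked as a scalar tensor). */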
if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num)
{
uint32_t input_byte = 0;
uint32_t output_byte = 0;
uint32_t in_dim = inputs[0]->attr.dim_num;
input_byte = vsi_nn_TypeGetBytesExt(inputs[0]->attr.dtype.vx_type);
output_byte = vsi_nn_TypeGetBytesExt(outputs[0]->attr.dtype.vx_type);
if (input_byte == output_byte)
{
outputs[0]->attr.dim_num = in_dim;
for (i = 0; i < (int32_t)(in_dim); i++)
{
outputs[0]->attr.size[i] = inputs[0]->attr.size[i];
}
}
else if (input_byte > output_byte)
{
outputs[0]->attr.dim_num = in_dim + 1;
outputs[0]->attr.size[0] = input_byte / output_byte;
for (i = 1; i < (int32_t)(outputs[0]->attr.dim_num); i++)
{
outputs[0]->attr.size[i] = inputs[0]->attr.size[i - 1];
}
}
else
{
if ((uint32_t)(inputs[0]->attr.size[in_dim - 1]) != output_byte / input_byte)
{
VSILOGE("If input datatype is smaller than output datatype, bitcast op requires that \
the rightmost dimension be equal to sizeof(output datatype) / sizeof(input datatype)");
return FALSE;
}
outputs[0]->attr.dim_num = in_dim - 1;
if (outputs[0]->attr.dim_num == 0)
{
outputs[0]->attr.size[0] = 1;
vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
}
else
{
for (i = 0; i < (int32_t)(outputs[0]->attr.dim_num); i++)
{
outputs[0]->attr.size[i] = inputs[0]->attr.size[i + 1];
}
}
}
}
return TRUE;
} /* op_setup() */
__BEGIN_DECLS
/* Registrar */
DEF_OP_REG
(
/* op_name */ BITCAST,
/* init */ NULL,
/* compute */ op_compute,
/* deinit */ NULL,
/* check */ NULL,
/* setup */ op_setup,
/* optimize */ NULL,
/* input_num */ _INPUT_NUM,
/* output_num */ _OUTPUT_NUM
);
__END_DECLS

View File

@ -0,0 +1,258 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <string.h>
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_log.h"
#include "vsi_nn_node.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_constraint_check.h"
typedef struct _col2im_local_data_t {
int32_t placeholder;
} col2im_local_data_t;
/*
Declare number of input and output.
*/
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
static vsi_status op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_param_t* param = NULL;
param = vsi_nn_kernel_param_create();
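/* Forward the col2im attributes to the kernel: strides, pads and dilations are
 * passed as scalar parameters, block_shape as a buffer of dim_num entries. */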
vsi_nn_kernel_param_add_int32( param, "stride_w", self->nn_param.col2im.strides[0] );
vsi_nn_kernel_param_add_int32( param, "stride_h", self->nn_param.col2im.strides[1] );
vsi_nn_kernel_param_add_int32( param, "stride_d", self->nn_param.col2im.strides[2] );
vsi_nn_kernel_param_add_int32( param, "pad_w_front", self->nn_param.col2im.pads[0] );
vsi_nn_kernel_param_add_int32( param, "pad_w_end", self->nn_param.col2im.pads[1] );
vsi_nn_kernel_param_add_int32( param, "pad_h_front", self->nn_param.col2im.pads[2] );
vsi_nn_kernel_param_add_int32( param, "pad_h_end", self->nn_param.col2im.pads[3] );
vsi_nn_kernel_param_add_int32( param, "pad_d_front", self->nn_param.col2im.pads[4] );
vsi_nn_kernel_param_add_int32( param, "pad_d_end", self->nn_param.col2im.pads[5] );
vsi_nn_kernel_param_add_int32( param, "dilation_w", self->nn_param.col2im.dilations[0] );
vsi_nn_kernel_param_add_int32( param, "dilation_h", self->nn_param.col2im.dilations[1] );
vsi_nn_kernel_param_add_int32( param, "dilation_d", self->nn_param.col2im.dilations[2] );
vsi_nn_kernel_param_add_buffer( param, "block_shape", (void*)self->nn_param.col2im.block_shape, \
self->nn_param.col2im.dim_num );
self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "col2im",
inputs, 1, outputs, 1, param );
if (self->n)
{
status = VSI_SUCCESS;
}
return status;
} /* op_compute() */
static vsi_bool op_check
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
BEGIN_IO_TYPE_DECL(COL2IM, 1, 1)
IO_TYPE(D_F32, D_F32)
IO_TYPE(D_F32, D_I32)
IO_TYPE(D_F32, D_U32)
IO_TYPE(D_F32, D_F16)
IO_TYPE(D_I32, D_F32)
IO_TYPE(D_I32, D_I32)
IO_TYPE(D_I32, D_U32)
IO_TYPE(D_I32, D_F16)
IO_TYPE(D_U32, D_F32)
IO_TYPE(D_U32, D_I32)
IO_TYPE(D_U32, D_U32)
IO_TYPE(D_F16, D_I16|Q_DFP)
IO_TYPE(D_F16, D_I16|Q_ASYM)
IO_TYPE(D_F16, D_I16|Q_SYM)
IO_TYPE(D_F16, D_I8|Q_DFP)
IO_TYPE(D_F16, D_I8|Q_ASYM)
IO_TYPE(D_F16, D_I8|Q_SYM)
IO_TYPE(D_F16, D_U8|Q_ASYM)
IO_TYPE(D_I16|Q_DFP, D_F16)
IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP)
IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM)
IO_TYPE(D_I16|Q_ASYM, D_F16)
IO_TYPE(D_I16|Q_ASYM, D_I8|Q_DFP)
IO_TYPE(D_I16|Q_ASYM, D_U8|Q_ASYM)
IO_TYPE(D_I16|Q_SYM, D_F16)
IO_TYPE(D_I16|Q_SYM, D_I8|Q_DFP)
IO_TYPE(D_I16|Q_SYM, D_U8|Q_ASYM)
IO_TYPE(D_I16, D_F16)
IO_TYPE(D_I16, D_I8|Q_DFP)
IO_TYPE(D_I16, D_U8|Q_ASYM)
IO_TYPE(D_I16, D_I32)
IO_TYPE(D_I16, D_U32)
IO_TYPE(D_I16, D_F32)
IO_TYPE(D_I8|Q_DFP, D_F16)
IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP)
IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM)
IO_TYPE(D_I8|Q_ASYM, D_F16)
IO_TYPE(D_I8|Q_ASYM, D_I16|Q_DFP)
IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM)
IO_TYPE(D_I8|Q_SYM, D_F16)
IO_TYPE(D_I8|Q_SYM, D_I16|Q_DFP)
IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM)
IO_TYPE(D_I8, D_F16)
IO_TYPE(D_I8, D_I16|Q_DFP)
IO_TYPE(D_I8, D_U8|Q_ASYM)
IO_TYPE(D_I8, D_I32)
IO_TYPE(D_I8, D_U32)
IO_TYPE(D_I8, D_F32)
IO_TYPE(D_U8|Q_ASYM, D_F16)
IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP)
IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP)
IO_TYPE(D_U8, D_F16)
IO_TYPE(D_U8, D_I16|Q_DFP)
IO_TYPE(D_U8, D_I8|Q_DFP)
IO_TYPE(D_U8, D_I32)
IO_TYPE(D_U8, D_U32)
IO_TYPE(D_U8, D_F32)
IO_TYPE(D_F32, D_I16|Q_DFP)
IO_TYPE(D_F32, D_I16|Q_ASYM)
IO_TYPE(D_F32, D_I16|Q_SYM)
IO_TYPE(D_F32, D_I8|Q_DFP)
IO_TYPE(D_F32, D_I8|Q_ASYM)
IO_TYPE(D_F32, D_I8|Q_SYM)
IO_TYPE(D_F32, D_U8|Q_ASYM)
IO_TYPE(D_I32, D_I16|Q_DFP)
IO_TYPE(D_I32, D_I16|Q_ASYM)
IO_TYPE(D_I32, D_I16|Q_SYM)
IO_TYPE(D_I32, D_I8|Q_DFP)
IO_TYPE(D_I32, D_I8|Q_ASYM)
IO_TYPE(D_I32, D_I8|Q_SYM)
IO_TYPE(D_I32, D_U8|Q_ASYM)
IO_TYPE(D_F16, D_F32)
IO_TYPE(D_F16, D_I32)
IO_TYPE(D_F16, D_I16)
IO_TYPE(D_F16, D_U8)
IO_TYPE(D_F16, D_I8)
IO_TYPE(D_F16, D_F16)
IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM)
IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP)
IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM)
IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM)
IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP)
IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM)
IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM)
IO_TYPE(D_U8|Q_ASYM, D_F32)
IO_TYPE(D_U8|Q_ASYM, D_I32)
IO_TYPE(D_BF16, D_BF16)
END_IO_TYPE_DECL(COL2IM)
if (!VALIDATE_OP_IO_TYPES(COL2IM, self, inputs, self->input.num, outputs, self->output.num)) {
char* desc = generate_op_io_types_desc(inputs,
self->input.num, outputs, self->output.num);
VSILOGE("Inputs/Outputs data type not support: %s", desc);
destroy_op_io_types_desc(desc);
return FALSE;
}
return TRUE;
} /* op_check() */
static vsi_bool op_setup
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_nn_col2im_param *p = NULL;
p = (vsi_nn_col2im_param* )&(self->nn_param.col2im);
int32_t i = 0;
vsi_size_t block_size = 1;
vsi_size_t channel = 1;
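/* With the output shape on AUTO, rebuild it from the col2im attributes:
 * image_shape provides the spatial sizes and the product of block_shape
 * determines how many input columns fold into one output channel. */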
if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num)
{
outputs[0]->attr.dim_num = p->dim_num + 2;
for (i = 0; i < p->dim_num; i++)
{
outputs[0]->attr.size[i] = (vsi_size_t)p->image_shape[i];
block_size = block_size * (vsi_size_t)p->block_shape[i];
}
channel = inputs[0]->attr.size[1] / block_size;
outputs[0]->attr.size[i + 1] = channel;
outputs[0]->attr.size[i + 2] = inputs[0]->attr.size[0];
}
return TRUE;
} /* op_setup() */
static vsi_status op_init
(
vsi_nn_node_t* self
)
{
self->nn_param.col2im.pads[0] = 0;
self->nn_param.col2im.pads[1] = 0;
self->nn_param.col2im.pads[2] = 0;
self->nn_param.col2im.pads[3] = 0;
self->nn_param.col2im.pads[4] = 0;
self->nn_param.col2im.pads[5] = 0;
self->nn_param.col2im.strides[0] = 1;
self->nn_param.col2im.strides[1] = 1;
self->nn_param.col2im.strides[2] = 1;
self->nn_param.col2im.dilations[0] = 1;
self->nn_param.col2im.dilations[1] = 1;
self->nn_param.col2im.dilations[2] = 1;
return VSI_SUCCESS;
}
__BEGIN_DECLS
/* Registrar */
DEF_OP_REG
(
/* op_name */ COL2IM,
/* init */ op_init,
/* compute */ op_compute,
/* deinit */ vsi_nn_op_common_deinit,
/* check */ op_check,
/* setup */ op_setup,
/* optimize */ NULL,
/* input_num */ _INPUT_NUM,
/* output_num */ _OUTPUT_NUM
);
__END_DECLS

View File

@ -28,6 +28,7 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_node.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
@ -278,7 +279,7 @@ static vsi_status op_compute
if(_is_tensorview_support(self, outputs)
&& _is_same_quant(self, inputs, outputs)
&& (_has_norm_input(self, inputs) == FALSE)
&& self->graph->ctx->options.enable_concat_optimize)
&& ((vsi_nn_graph_prv_t*)(self->graph))->options->enable_concat_optimize)
{
iter = self->nn_param.concat.lcl_data;
while( NULL != iter )
@ -443,7 +444,7 @@ static vsi_status op_optimize
if (_is_tensorview_support(self, outputs) == FALSE ||
_is_same_quant(self, inputs, outputs) == FALSE ||
_has_norm_input(self, inputs) == TRUE ||
self->graph->ctx->options.enable_concat_optimize == 0)
((vsi_nn_graph_prv_t*)(self->graph))->options->enable_concat_optimize == 0)
{
return status;
}

View File

@ -23,6 +23,7 @@
*****************************************************************************/
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_graph.h"
@ -95,7 +96,7 @@ static vsi_status op_optimize
status = VSI_SUCCESS;
if( !self->graph->ctx->options.enable_dataconvert_optimize )
if( !((vsi_nn_graph_prv_t*)(self->graph))->options->enable_dataconvert_optimize )
{
return status;
}
@ -266,14 +267,14 @@ static vsi_bool op_check
IO_TYPE(D_BF16, D_BF16)
IO_TYPE(D_BF16, D_F16)
IO_TYPE(D_BF16, D_F32)
IO_TYPE(D_I32, D_I32)
IO_TYPE(D_I32, D_F32)
IO_TYPE(D_I32, D_F16)
IO_TYPE(D_I32, D_I16|Q_DFP)
IO_TYPE(D_I32, D_I8|Q_DFP)
IO_TYPE(D_I32, D_U32)
IO_TYPE(D_I32, D_U16)
IO_TYPE(D_I32, D_U8|Q_ASYM)
IO_TYPE(D_I32|Q_ASYM, D_I32|Q_ASYM)
IO_TYPE(D_I32|Q_ASYM, D_F32)
IO_TYPE(D_I32|Q_ASYM, D_F16)
IO_TYPE(D_I32|Q_ASYM, D_I16|Q_DFP)
IO_TYPE(D_I32|Q_ASYM, D_I8|Q_DFP)
IO_TYPE(D_I32|Q_ASYM, D_U32|Q_ASYM)
IO_TYPE(D_I32|Q_ASYM, D_U16|Q_ASYM)
IO_TYPE(D_I32|Q_ASYM, D_U8|Q_ASYM)
IO_TYPE(D_U32, D_U32)
IO_TYPE(D_U32, D_I16|Q_DFP)
IO_TYPE(D_U32, D_I8|Q_DFP)
@ -281,7 +282,7 @@ static vsi_bool op_check
IO_TYPE(D_U32, D_U8|Q_ASYM)
IO_TYPE(D_U32, D_U8)
IO_TYPE(D_BF16, D_I32)
IO_TYPE(D_I32, D_BF16)
IO_TYPE(D_I32|Q_ASYM, D_BF16)
IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM)
IO_TYPE(D_U4|Q_SYM, D_U8|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_U4|Q_ASYM)

View File

@ -183,10 +183,16 @@ vsi_bool vsi_nn_op_eltwise_setup
shape[i] = sz0;
}
if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
outputs[0]->attr.dim_num = out_rank;
memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(vsi_size_t) );
if (out_rank == 1 &&
vsi_nn_GetTensorIsScalar(inputs[0]) &&
vsi_nn_GetTensorIsScalar(inputs[1]))
{
vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
}
}
else
{

View File

@ -54,10 +54,12 @@ static vsi_status op_compute
vsi_nn_kernel_param_t* param = NULL;
int32_t align_corners = self->nn_param.gridsample.align_corners;
int32_t pad_mode = (int32_t)self->nn_param.gridsample.padding_mode;
int32_t mode = (int32_t)self->nn_param.gridsample.mode;
vsi_nn_kernel_node_t n;
char kernel_name[128];
param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_int32(param, "mode", mode);
vsi_nn_kernel_param_add_int32(param, "align_corners", align_corners);
vsi_nn_kernel_param_add_int32(param, "padding_mode", pad_mode);

View File

@ -0,0 +1,412 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <string.h>
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_log.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_node.h"
#include "vsi_nn_prv.h"
#include "utils/vsi_nn_math.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_constraint_check.h"
/*
Declare number of input and output.
*/
#define _ARG_NUM (1)
#define _INPUT_NUM (3)
#define _OUTPUT_NUM (1)
#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM)
#define _PARAM_NUM (_ARG_NUM + _IO_NUM)
#define LOCAL() ((vsi_nn_grouped_conv3d_param_local_data *)nn_param->local)
typedef struct _vsi_nn_grouped_conv3d_param_local_data {
vsi_nn_tensor_t ** input_tensor_group;
vsi_nn_tensor_t ** weight_tensor_group;
vsi_nn_tensor_t ** bias_tensor_group;
vsi_nn_tensor_t ** output_tensor_group;
} vsi_nn_grouped_conv3d_param_local_data;
static vsi_status op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
#if VX_CONV_3D_API_SUPPORT
#define _TENSOR_LEN 64
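/* Grouped 3D convolution is lowered to nn_param->group independent
 * vxConv3dLayer nodes: input, weight, bias and output are split into
 * per-group views and convolved pairwise. */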
vsi_bool res;
uint32_t i;
char tensor_name[_TENSOR_LEN];
vsi_nn_grouped_conv3d_param *nn_param = &self->nn_param.grouped_conv3d;
nn_param->local = (vsi_nn_grouped_conv3d_param_local_data*)malloc(
sizeof(vsi_nn_grouped_conv3d_param_local_data));
if (NULL == nn_param->local)
{
VSILOGE("Malloc fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__);
return VSI_FAILURE;
}
memset(nn_param->local, 0, sizeof(vsi_nn_grouped_conv3d_param_local_data));
LOCAL()->input_tensor_group = (vsi_nn_tensor_t **)malloc(
nn_param->group * sizeof(vsi_nn_tensor_t *));
if (NULL == LOCAL()->input_tensor_group)
{
VSILOGE("Malloc fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__);
return VSI_FAILURE;
}
memset(LOCAL()->input_tensor_group, 0, nn_param->group * sizeof(vsi_nn_tensor_t *));
res = vsi_nn_CreateTensorGroup(self->graph, inputs[0], 3,
LOCAL()->input_tensor_group, nn_param->group);
if (res == FALSE)
{
VSILOGE("CreateTensorGroup fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__);
return VSI_FAILURE;
}
LOCAL()->weight_tensor_group = (vsi_nn_tensor_t **)malloc(
nn_param->group * sizeof(vsi_nn_tensor_t *));
if (NULL == LOCAL()->weight_tensor_group)
{
VSILOGE("Malloc fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__);
return VSI_FAILURE;
}
memset(LOCAL()->weight_tensor_group, 0, nn_param->group * sizeof(vsi_nn_tensor_t *));
res = vsi_nn_CreateTensorGroup(self->graph, inputs[1], 4,
LOCAL()->weight_tensor_group, nn_param->group);
if (res == FALSE)
{
VSILOGE("CreateTensorGroup fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__);
return VSI_FAILURE;
}
LOCAL()->bias_tensor_group = (vsi_nn_tensor_t **)malloc(
nn_param->group * sizeof(vsi_nn_tensor_t *));
if (NULL == LOCAL()->bias_tensor_group)
{
VSILOGE("Malloc fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__);
return VSI_FAILURE;
}
memset(LOCAL()->bias_tensor_group, 0, nn_param->group * sizeof(vsi_nn_tensor_t *));
if (inputs[2] != NULL)
{
res = vsi_nn_CreateTensorGroup(self->graph, inputs[2], 0,
LOCAL()->bias_tensor_group, nn_param->group);
if (res == FALSE)
{
VSILOGE("CreateTensorGroup fail, (GROUPED_CONV2D) at [%s : %d]\n", __FILE__, __LINE__);
return VSI_FAILURE;
}
}
LOCAL()->output_tensor_group = (vsi_nn_tensor_t **)malloc(
nn_param->group * sizeof(vsi_nn_tensor_t *));
if (NULL == LOCAL()->output_tensor_group)
{
VSILOGE("Malloc fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__);
return VSI_FAILURE;
}
memset(LOCAL()->output_tensor_group, 0, nn_param->group * sizeof(vsi_nn_tensor_t *));
res = vsi_nn_CreateTensorGroup(self->graph, outputs[0], 3,
LOCAL()->output_tensor_group, nn_param->group);
if (res == FALSE)
{
VSILOGE("CreateTensorGroup fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__);
return VSI_FAILURE;
}
for (i = 0; i < nn_param->group; i++)
{
vx_tensor bias;
vx_nn_convolution_3d_params_t *param = NULL;
vx_nn_convolution_3d_params_t param_;
memset( &param_, 0, sizeof( vx_nn_convolution_3d_params_t ) );
param = &param_;
param->padding_w_left = self->nn_param.grouped_conv3d.pad[0];
param->padding_w_right = self->nn_param.grouped_conv3d.pad[1];
param->padding_h_top = self->nn_param.grouped_conv3d.pad[2];
param->padding_h_bottom = self->nn_param.grouped_conv3d.pad[3];
param->padding_d_front = self->nn_param.grouped_conv3d.pad[4];
param->padding_d_rear = self->nn_param.grouped_conv3d.pad[5];
param->stride_w = self->nn_param.grouped_conv3d.stride[0];
param->stride_h = self->nn_param.grouped_conv3d.stride[1];
param->stride_d = self->nn_param.grouped_conv3d.stride[2];
if (self->nn_param.grouped_conv3d.dilation[0] *
self->nn_param.grouped_conv3d.dilation[1] *
self->nn_param.grouped_conv3d.dilation[2] > 1)
{
VSILOGE("conv3d could not support dilation > 1\n");
return VSI_FAILURE;
}
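/* The vx_nn_convolution_3d_params_t dilation fields appear to be zero-based
 * (0 meaning no dilation), hence the "- 1" conversion below. */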
if ( self->nn_param.grouped_conv3d.dilation[0] > 0 )
{
param->dilation_w = self->nn_param.grouped_conv3d.dilation[0] - 1;
}
if ( self->nn_param.grouped_conv3d.dilation[1] > 0 )
{
param->dilation_h = self->nn_param.grouped_conv3d.dilation[1] - 1;
}
if ( self->nn_param.grouped_conv3d.dilation[2] > 0 )
{
param->dilation_d = self->nn_param.grouped_conv3d.dilation[2] - 1;
}
param->pad_mode = vsi_nn_get_vx_pad_mode(nn_param->pad_mode);
param->depth_multiplier = self->nn_param.grouped_conv3d.multiplier;
param->overflow_policy = self->vx_param.overflow_policy;
param->rounding_policy = self->vx_param.rounding_policy;
param->down_scale_size_rounding = self->vx_param.down_scale_size_rounding;
if ( inputs[2] == NULL )
{
bias = NULL;
}
else
{
bias = LOCAL()->bias_tensor_group[i]->t;
}
self->n = vxConv3dLayer(
self->graph->g,
LOCAL()->input_tensor_group[i]->t,
LOCAL()->weight_tensor_group[i]->t,
bias,
(vx_nn_convolution_3d_params_t* )param,
sizeof( vx_nn_convolution_3d_params_t),
LOCAL()->output_tensor_group[i]->t
);
memset(tensor_name, 0, sizeof(tensor_name));
snprintf(tensor_name, sizeof(tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, i);
if (vxSetReferenceName((vx_reference)LOCAL()->output_tensor_group[i]->t, tensor_name) == VSI_FAILURE)
{
VSILOGW("Set uid %u copy node output name fail", self->uid);
return VSI_FAILURE;
}
if ( NULL == self->n )
{
VSILOGE("Add vxConvolutionLayer fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__);
return VSI_FAILURE;
}
else
{
// no need to maintain self->n
vxReleaseNode( &self->n );
self->n = NULL;
}
}
#else
VSI_UNREFERENCED(self);
VSI_UNREFERENCED(inputs);
VSI_UNREFERENCED(outputs);
#endif
return VSI_SUCCESS;
} /* op_compute() */
static vsi_bool op_check
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_bool ret = FALSE;
ret = vsi_nn_OpCheck(VSI_NN_OP_CONV3D, self, inputs, outputs);
return ret;
} /* op_check() */
static vsi_bool op_setup
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
/* TODO: Add code to compute outputs' shape. */
vsi_nn_grouped_conv3d_param *nn_param;
vsi_size_t perm[] = { 3, 2, 0, 1 };
#ifdef VX_CONVERT_POLICY_WRAP_ENABLE
if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 )
{
self->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE;
}
#endif
if ( VSI_NN_DIM_FMT_NHWC == inputs[1]->attr.dtype.fmt &&
VSI_NN_TYPE_VDATA != inputs[1]->attr.dtype.vx_type )
{
vsi_nn_TransposeTensor( self->graph, inputs[1], perm, 4, NULL );
inputs[1]->attr.dtype.fmt = VSI_NN_DIM_FMT_NCHW;
}
nn_param = &self->nn_param.grouped_conv3d;
{
vsi_size_t i, pad[_cnt_of_array(nn_param->pad)] = {0};
for (i = 0; i < _cnt_of_array(nn_param->pad); i++)
{
pad[i] = self->nn_param.grouped_conv3d.pad[i];
}
vsi_nn_compute_padding_3d(
inputs[0]->attr.size,
inputs[1]->attr.size,
nn_param->stride,
nn_param->dilation,
nn_param->pad_type,
pad
);
for (i = 0; i < _cnt_of_array(nn_param->pad); i++)
{
self->nn_param.grouped_conv3d.pad[i] = (uint32_t)pad[i];
}
}
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
outputs[0]->attr.size[0] = vsi_nn_ComputeFilterSize
(
inputs[0]->attr.size[0],
inputs[1]->attr.size[0],
&nn_param->pad[0],
nn_param->stride[0],
nn_param->dilation[0],
VSI_NN_ROUND_FLOOR
);
outputs[0]->attr.size[1] = vsi_nn_ComputeFilterSize
(
inputs[0]->attr.size[1],
inputs[1]->attr.size[1],
&nn_param->pad[2],
nn_param->stride[1],
nn_param->dilation[1],
VSI_NN_ROUND_FLOOR
);
outputs[0]->attr.size[2] = vsi_nn_ComputeFilterSize
(
inputs[0]->attr.size[2],
inputs[1]->attr.size[2],
&nn_param->pad[4],
nn_param->stride[2],
nn_param->dilation[2],
VSI_NN_ROUND_FLOOR
);
if (self->nn_param.grouped_conv3d.weights > 0)
{
outputs[0]->attr.size[3] = self->nn_param.grouped_conv3d.weights;
}
else if (self->nn_param.grouped_conv3d.multiplier > 0)
{
outputs[0]->attr.size[3] = inputs[0]->attr.size[3] * self->nn_param.grouped_conv3d.multiplier;
}
else
{
outputs[0]->attr.size[3] = inputs[1]->attr.size[4];
}
outputs[0]->attr.size[4] = inputs[0]->attr.size[4];
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
}
return TRUE;
} /* op_setup() */
static vsi_status op_deinit
(
vsi_nn_node_t* self
)
{
vsi_nn_grouped_conv3d_param *nn_param = &(self->nn_param.grouped_conv3d);
uint32_t i;
if (LOCAL())
{
if (LOCAL()->input_tensor_group)
{
for (i = 0; i < nn_param->group; i++)
{
vsi_nn_ReleaseTensor(&(LOCAL()->input_tensor_group[i]));
}
free(LOCAL()->input_tensor_group);
}
if (LOCAL()->weight_tensor_group)
{
for (i = 0; i < nn_param->group; i++)
{
vsi_nn_ReleaseTensor(&(LOCAL()->weight_tensor_group[i]));
}
free(LOCAL()->weight_tensor_group);
}
if (LOCAL()->bias_tensor_group != NULL)
{
for (i = 0; i < nn_param->group; i++)
{
vsi_nn_ReleaseTensor(&(LOCAL()->bias_tensor_group[i]));
}
free(LOCAL()->bias_tensor_group);
}
if (LOCAL()->output_tensor_group != NULL)
{
for (i = 0; i < nn_param->group; i++)
{
vsi_nn_ReleaseTensor(&(LOCAL()->output_tensor_group[i]));
}
free(LOCAL()->output_tensor_group);
}
free(LOCAL());
}
vsi_nn_op_common_deinit(self);
return VSI_SUCCESS;
} /* op_deinit() */
__BEGIN_DECLS
/* Registrar */
DEF_OP_REG
(
/* op_name */ GROUPED_CONV3D,
/* init */ NULL,
/* compute */ op_compute,
/* deinit */ op_deinit,
/* check */ op_check,
/* setup */ op_setup,
/* optimize */ NULL,
/* input_num */ _INPUT_NUM,
/* output_num */ _OUTPUT_NUM
);
__END_DECLS

View File

@ -0,0 +1,206 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <string.h>
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_log.h"
#include "vsi_nn_node.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_constraint_check.h"
#include "vsi_nn_tensor_util_prv.h"
typedef struct _l1_layer_norm_local_data_t {
int32_t placeholder;
} l1_layer_norm_local_data_t;
/*
Declare number of input and output.
*/
#define _INPUT_NUM (4)
#define _OUTPUT_NUM (1)
static vsi_status op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_param_t * param = NULL;
vsi_nn_kernel_node_t n = NULL;
float eps = self->nn_param.l1_layer_norm.eps;
int32_t axis = self->nn_param.l1_layer_norm.axis;
param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_float32( param, "eps", eps );
vsi_nn_kernel_param_add_int32( param, "axis", axis );
n = vsi_nn_kernel_selector( self->graph, "l1_layer_norm",
inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param );
if ( n != NULL )
{
self->n = (vx_node)n;
status = VSI_SUCCESS;
}
if (param != NULL)
{
vsi_nn_kernel_param_release( &param );
}
return status;
} /* op_compute() */
static vsi_bool op_check
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_bool ret = vsi_nn_is_stream_process_supported_types(self->graph, inputs, self->input.num);
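/* When the stream processor can handle these dtypes, skip the static
 * IO-type table check below. */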
if (!ret)
{
BEGIN_IO_TYPE_DECL(L1_LAYER_NORM, 4, 1)
IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F16)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_U8|Q_ASYM)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_U8|Q_ASYM)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_DFP)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_DFP)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_ASYM)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_ASYM)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_SYM)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_SYM)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_DFP)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_DFP)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_ASYM)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_ASYM)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_SYM)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_SYM)
IO_TYPE(D_BF16, D_F32, D_F32, D_F32, D_BF16)
IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F32, D_U8|Q_ASYM)
IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F32, D_I16|Q_DFP)
IO_TYPE(D_I16|Q_ASYM, D_F32, D_F16, D_F32, D_I16|Q_ASYM)
IO_TYPE(D_I16|Q_SYM, D_F32, D_F16, D_F32, D_I16|Q_SYM)
IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_I16|Q_ASYM, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_I16|Q_SYM, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F32, D_I8|Q_DFP)
IO_TYPE(D_I8|Q_ASYM, D_F32, D_F16, D_F32, D_I8|Q_ASYM)
IO_TYPE(D_I8|Q_SYM, D_F32, D_F16, D_F32, D_I8|Q_SYM)
IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_I8|Q_ASYM, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_I8|Q_SYM, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_U8|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F16)
IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F32, D_I16|Q_DFP)
IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_F32, D_I16|Q_ASYM)
IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_F32, D_I16|Q_SYM)
IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F32, D_F16)
IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_F32, D_F16)
IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_F32, D_F16)
IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F32, D_I8|Q_DFP)
IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_F32, D_I8|Q_ASYM)
IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_F32, D_I8|Q_SYM)
IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F32, D_F16)
IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_F32, D_F16)
IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_F32, D_F16)
END_IO_TYPE_DECL(L1_LAYER_NORM)
if (!VALIDATE_OP_IO_TYPES(L1_LAYER_NORM, self, inputs, self->input.num, outputs, self->output.num))
{
char* desc = generate_op_io_types_desc(inputs,
self->input.num, outputs, self->output.num);
VSILOGE("Inputs/Outputs data type not support: %s", desc);
destroy_op_io_types_desc(desc);
return FALSE;
}
}
return TRUE;
} /* op_check() */
static vsi_bool op_setup
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
int32_t i = 0;
VSI_UNREFERENCED(self);
if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num)
{
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
for (i = 0; i < (int32_t)inputs[0]->attr.dim_num; i++)
{
outputs[0]->attr.size[i] = inputs[0]->attr.size[i];
}
}
return TRUE;
} /* op_setup() */
static vsi_status op_init
(
vsi_nn_node_t* self
)
{
vsi_status status = VSI_SUCCESS;
self->nn_param.l1_layer_norm.axis = 0;
return status;
} /* op_init() */
__BEGIN_DECLS
/* Registrar */
DEF_OP_REG
(
/* op_name */ L1_LAYER_NORM,
/* init */ op_init,
/* compute */ op_compute,
/* deinit */ NULL,
/* check */ op_check,
/* setup */ op_setup,
/* optimize */ NULL,
/* input_num */ _INPUT_NUM,
/* output_num */ _OUTPUT_NUM
);
__END_DECLS

View File

@ -25,6 +25,7 @@
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_log.h"
#include "vsi_nn_graph.h"
@ -161,7 +162,7 @@ static vsi_bool op_setup
if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR ||
p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP)
{
enable_rgb88_planar_nhwc = self->graph->ctx->options.enable_rgb88_planar_nhwc;
enable_rgb88_planar_nhwc = ((vsi_nn_graph_prv_t*)(self->graph))->options->enable_rgb88_planar_nhwc;
}
}

View File

@ -183,7 +183,8 @@ static vsi_bool _check_is_sp_supported_type
return FALSE;
}
if ( (axes_num == 1 && (axes[0] == 0 || axes[0] == 2)) ||
if ( (axes_num == 1 && (axes[0] == 0 || axes[0] == 2 ||
(axes[0] == 1 && (input->attr.size[0] == 1 || input->attr.size[2] == 1)))) ||
(axes_num == 2 && ((axes[0] < 2 && axes[1] < 2) || (axes[0] == 1 && axes[1] == 2))) )
{
return TRUE;
@ -1167,6 +1168,7 @@ static vsi_bool op_setup
{
outputs[0]->attr.dim_num = 1;
outputs[0]->attr.size[0] = 1;
vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
}
else
{

View File

@ -93,52 +93,32 @@ static vsi_bool op_check
if (!ret)
{
BEGIN_IO_TYPE_DECL(RMS_NORM, 2, 1)
IO_TYPE(D_F32, D_F32, D_F32)
IO_TYPE(D_F16, D_F32, D_F16)
IO_TYPE(D_F16, D_F32, D_F16)
IO_TYPE(D_F16, D_F32, D_U8 | Q_ASYM)
IO_TYPE(D_F16, D_F32, D_U8 | Q_ASYM)
IO_TYPE(D_F16, D_F32, D_I8 | Q_DFP)
IO_TYPE(D_F16, D_F32, D_I8 | Q_DFP)
IO_TYPE(D_F16, D_F32, D_I8 | Q_ASYM)
IO_TYPE(D_F16, D_F32, D_I8 | Q_ASYM)
IO_TYPE(D_F16, D_F32, D_I8 | Q_SYM)
IO_TYPE(D_F16, D_F32, D_I8 | Q_SYM)
IO_TYPE(D_F16, D_F32, D_I16 | Q_DFP)
IO_TYPE(D_F16, D_F32, D_I16 | Q_DFP)
IO_TYPE(D_F16, D_F32, D_I16 | Q_ASYM)
IO_TYPE(D_F16, D_F32, D_I16 | Q_ASYM)
IO_TYPE(D_F16, D_F32, D_I16 | Q_SYM)
IO_TYPE(D_F16, D_F32, D_I16 | Q_SYM)
IO_TYPE(D_BF16, D_F32, D_BF16)
IO_TYPE(D_U8 | Q_ASYM, D_F32, D_F16)
IO_TYPE(D_U8 | Q_ASYM, D_F32, D_U8 | Q_ASYM)
IO_TYPE(D_I16 | Q_DFP, D_F32, D_I16 | Q_DFP)
IO_TYPE(D_I16 | Q_ASYM, D_F32, D_I16 | Q_ASYM)
IO_TYPE(D_I16 | Q_SYM, D_F32, D_I16 | Q_SYM)
IO_TYPE(D_I16 | Q_DFP, D_F32, D_F16)
IO_TYPE(D_I16 | Q_ASYM, D_F32, D_F16)
IO_TYPE(D_I16 | Q_SYM, D_F32, D_F16)
IO_TYPE(D_I8 | Q_DFP, D_F32, D_I8 | Q_DFP)
IO_TYPE(D_I8 | Q_ASYM, D_F32, D_I8 | Q_ASYM)
IO_TYPE(D_I8 | Q_SYM, D_F32, D_I8 | Q_SYM)
IO_TYPE(D_I8 | Q_DFP, D_F32, D_F16)
IO_TYPE(D_I8 | Q_ASYM, D_F32, D_F16)
IO_TYPE(D_I8 | Q_SYM, D_F32, D_F16)
IO_TYPE(D_U8 | Q_ASYM, D_F32, D_U8 | Q_ASYM)
IO_TYPE(D_U8 | Q_ASYM, D_F32, D_F16)
IO_TYPE(D_I16 | Q_DFP, D_F32, D_I16 | Q_DFP)
IO_TYPE(D_I16 | Q_ASYM, D_F32, D_I16 | Q_ASYM)
IO_TYPE(D_I16 | Q_SYM, D_F32, D_I16 | Q_SYM)
IO_TYPE(D_I16 | Q_DFP, D_F32, D_F16)
IO_TYPE(D_I16 | Q_ASYM, D_F32, D_F16)
IO_TYPE(D_I16 | Q_SYM, D_F32, D_F16)
IO_TYPE(D_I8 | Q_DFP, D_F32, D_I8 | Q_DFP)
IO_TYPE(D_I8 | Q_ASYM, D_F32, D_I8 | Q_ASYM)
IO_TYPE(D_I8 | Q_SYM, D_F32, D_I8 | Q_SYM)
IO_TYPE(D_I8 | Q_DFP, D_F32, D_F16)
IO_TYPE(D_I8 | Q_ASYM, D_F32, D_F16)
IO_TYPE(D_I8 | Q_SYM, D_F32, D_F16)
IO_TYPE(D_F32, D_F32, D_F32)
IO_TYPE(D_F32, D_F32, D_F16)
IO_TYPE(D_F16, D_F32, D_F16)
IO_TYPE(D_F16, D_F32, D_F32)
IO_TYPE(D_F16, D_F32, D_U8 | Q_ASYM)
IO_TYPE(D_F16, D_F32, D_I8 | Q_DFP)
IO_TYPE(D_F16, D_F32, D_I8 | Q_ASYM)
IO_TYPE(D_F16, D_F32, D_I8 | Q_SYM)
IO_TYPE(D_F16, D_F32, D_I16 | Q_DFP)
IO_TYPE(D_F16, D_F32, D_I16 | Q_ASYM)
IO_TYPE(D_F16, D_F32, D_I16 | Q_SYM)
IO_TYPE(D_BF16, D_F32, D_BF16)
IO_TYPE(D_U8 | Q_ASYM, D_F32, D_F16)
IO_TYPE(D_U8 | Q_ASYM, D_F32, D_U8 | Q_ASYM)
IO_TYPE(D_I16 | Q_DFP, D_F32, D_I16 | Q_DFP)
IO_TYPE(D_I16 | Q_ASYM, D_F32, D_I16 | Q_ASYM)
IO_TYPE(D_I16 | Q_SYM, D_F32, D_I16 | Q_SYM)
IO_TYPE(D_I16 | Q_DFP, D_F32, D_F16)
IO_TYPE(D_I16 | Q_ASYM, D_F32, D_F16)
IO_TYPE(D_I16 | Q_SYM, D_F32, D_F16)
IO_TYPE(D_I8 | Q_DFP, D_F32, D_I8 | Q_DFP)
IO_TYPE(D_I8 | Q_ASYM, D_F32, D_I8 | Q_ASYM)
IO_TYPE(D_I8 | Q_SYM, D_F32, D_I8 | Q_SYM)
IO_TYPE(D_I8 | Q_DFP, D_F32, D_F16)
IO_TYPE(D_I8 | Q_ASYM, D_F32, D_F16)
IO_TYPE(D_I8 | Q_SYM, D_F32, D_F16)
END_IO_TYPE_DECL(RMS_NORM)
if (!VALIDATE_OP_IO_TYPES(RMS_NORM, self, inputs, self->input.num, outputs, self->output.num))
{

View File

@ -25,6 +25,7 @@
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
@ -776,7 +777,7 @@ static vsi_status op_optimize
/* Only forward run stride_slice's optimize */
if ( direction == VSI_NN_OPTIMIZE_BACKWARD ||
!self->graph->ctx->options.enable_slice_optimize )
!((vsi_nn_graph_prv_t*)(self->graph))->options->enable_slice_optimize )
{
return status;
}

View File

@ -78,9 +78,10 @@ static vsi_status _tile_op_compute
vsi_size_t new_rank = 0;
vsi_bool ret = FALSE;
uint32_t i = 0;
vsi_size_t* multiples = (vsi_size_t*)self->nn_param.tile.multiples;
int32_t* multiples_ = (int32_t*)self->nn_param.tile.multiples;
vsi_nn_tensor_t* temp_tensors[3] = { NULL };
vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
vsi_size_t multiples[VSI_NN_MAX_DIM_NUM] = {1};
int32_t multiples_value[VSI_NN_MAX_DIM_NUM] = {0};
vsi_nn_tensor_attr_t attr;
@ -101,6 +102,11 @@ static vsi_status _tile_op_compute
temp_tensors[2] = outputs[0];
}
for (i = 0; i < inputs[0]->attr.dim_num; i ++)
{
multiples[i] = (vsi_size_t)multiples_[i];
}
ret = vsi_nn_kernel_optimize_tile_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num,
multiples, inputs[0]->attr.dim_num,
@ -111,6 +117,7 @@ static vsi_status _tile_op_compute
{
if (_is_supported_axis(shapes[1], new_rank) == FALSE)
{
uint32_t _multiples = (uint32_t)(new_rank > 4 && shapes[1][4] > 1 ? 3 : 2);
reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, inputs[0],\
shapes[0], (vsi_size_t)new_rank );
reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, temp_tensors[2],\
@ -125,8 +132,11 @@ static vsi_status _tile_op_compute
memcpy( &attr, &reshape_tensors[0]->attr, sizeof(attr));
attr.is_const = FALSE;
attr.vtl = TRUE;
attr.size[0] = reshape_tensors[2]->attr.size[0];
attr.size[1] = reshape_tensors[2]->attr.size[1];
for (i = 0; i < _multiples; i++)
{
attr.size[i] = reshape_tensors[2]->attr.size[i];
}
temp_tensors[0] = vsi_nn_CreateTensor( self->graph, &attr );
memset( &attr, 0 , sizeof(vsi_nn_tensor_attr_t) );
@ -136,9 +146,11 @@ static vsi_status _tile_op_compute
attr.size[0] = new_rank;
attr.dim_num = 1;
multiples_value[0] = (int32_t)shapes[1][0];
multiples_value[1] = (int32_t)shapes[1][1];
for (i = 0; i < new_rank; i++)
for (i = 0; i < _multiples; i++)
{
multiples_value[i] = (int32_t)shapes[1][i];
}
for (i = _multiples; i < new_rank; i++)
{
multiples_value[i] = 1;
}
@ -150,9 +162,11 @@ static vsi_status _tile_op_compute
goto final;
}
multiples_value[0] = 1;
multiples_value[1] = 1;
for (i = 0; i < new_rank; i++)
for (i = 0; i < _multiples; i++)
{
multiples_value[i] = 1;
}
for (i = _multiples; i < new_rank; i++)
{
multiples_value[i] = (int32_t)shapes[1][i];
}
@ -257,6 +271,7 @@ static vsi_bool op_check
IO_TYPE(D_F32, D_F32)
IO_TYPE(D_F32, D_U8|Q_ASYM)
IO_TYPE(D_F16, D_U8|Q_ASYM)
IO_TYPE(D_BOOL8, D_BOOL8)
END_IO_TYPE_DECL(TILE)
if (!VALIDATE_OP_IO_TYPES(TILE, self, inputs, self->input.num, outputs, self->output.num)) {
char* desc = generate_op_io_types_desc(inputs,

View File

@ -471,6 +471,10 @@ static _op_param_gen_t s_op_gen[] =
/* TAN */ NULL,
/* RMSNORM */ NULL,
/* SHAPE */ NULL,
/* BITCAST */ NULL,
/* GROUPED_CONV3D */ NULL,
/* COL2IM */ NULL,
/* L1_LAYER_NORM */ NULL,
};
_compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c );

View File

@ -772,6 +772,7 @@ vsi_bool vsi_nn_CreateTensorGroup
end[1] = in_tensor->attr.size[1];
end[2] = in_tensor->attr.size[2];
end[3] = in_tensor->attr.size[3];
end[4] = in_tensor->attr.size[4];
end[axis] = 0;
for( i = 0; i < group_number; i ++ )
{
@ -1259,6 +1260,32 @@ vsi_bool vsi_nn_is_same_quant_type(
}
break;
}
#ifdef VSI_PER_GROUP_QUANTIZATION_SUPPORT
case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC: {
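/* Per-group symmetric quantization only matches when both tensors agree on
 * group count, group size and every per-group scale (compared with a 1e-5
 * tolerance). */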
const float diff = (float)1e-5;
int32_t i = 0;
int32_t scale_cnt0 = src_dtype->group_count;
int32_t scale_cnt1 = dst_dtype->group_count;
int32_t group_size0 = src_dtype->group_size;
int32_t group_size1 = dst_dtype->group_size;
if (scale_cnt0 == scale_cnt1 && group_size0 == group_size1)
{
const float* src_scale_ptr = src_dtype->group_scales;
const float* dst_scale_ptr = dst_dtype->group_scales;
for (i = 0; i < scale_cnt0; i++)
{
if (vsi_nn_float_compare(
src_scale_ptr[i], dst_scale_ptr[i], diff) == FALSE)
{
return FALSE;
}
}
} else {
return FALSE;
}
break;
}
#endif
default:
break;
}

View File

@ -22,10 +22,10 @@
*
*****************************************************************************/
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_test.h"
#include "vsi_nn_context.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_types.h"
static vsi_status query_hardware_caps
(
@ -103,6 +103,9 @@ static const char* ENV_ENABLE_STREAM_PROCESSOR = "vendor.VSI_VX_ENABLE_STREAM_PR
static const char* ENV_FORCE_RGB888_OUT_NHWC = "vendor.VSI_NN_FORCE_RGB888_OUT_NHWC";
static const char* ENV_ENABLE_SLICE_OPTIMIZE = "vendor.VSI_NN_ENABLE_SLICE_OPTIMIZE";
static const char* ENV_ENABLE_BATCH_OPT = "vendor.VSI_VX_ENABLE_BATCH_OPT";
static const char* ENV_SAVE_FILE_TYPE = "vendor.VSI_SAVE_FILE_TYPE";
static const char* VSI_USE_IMAGE_PROCESS = "vendor.VSI_USE_IMAGE_PROCESS";
static const char* VSI_USE_FROM_HANDLE = "vendor.VSI_USE_FROM_HANDLE";
#else
static const char* ENV_ENABLE_SHADER = "VIV_VX_ENABLE_SHADER";
static const char* ENV_ENABLE_OPCHECK = "VSI_NN_ENABLE_OPCHECK";
@ -113,8 +116,11 @@ static const char* ENV_ENABLE_STREAM_PROCESSOR = "VSI_VX_ENABLE_STREAM_PROCESSOR
static const char* ENV_FORCE_RGB888_OUT_NHWC = "VSI_NN_FORCE_RGB888_OUT_NHWC";
static const char* ENV_ENABLE_SLICE_OPTIMIZE = "VSI_NN_ENABLE_SLICE_OPTIMIZE";
static const char* ENV_ENABLE_BATCH_OPT = "VSI_VX_ENABLE_BATCH_OPT";
static const char* ENV_SAVE_FILE_TYPE = "VSI_SAVE_FILE_TYPE";
static const char* VSI_USE_IMAGE_PROCESS = "VSI_USE_IMAGE_PROCESS";
static const char* VSI_USE_FROM_HANDLE = "VSI_USE_FROM_HANDLE";
#endif
static vsi_status vsi_nn_initOptions
vsi_status vsi_nn_initOptions
(
vsi_nn_runtime_option_t *options
)
@ -129,7 +135,7 @@ static vsi_status vsi_nn_initOptions
default_value = 1;
#endif
options->enable_concat_optimize = vsi_nn_getenv_asint(ENV_ENABLE_CONCAT_OPTIMIZE, default_value);
options->enable_asymi8_to_u8 = vsi_nn_getenv_asint(ENV_ENABLE_I8TOU8, 1);
options->enable_i8_to_u8 = vsi_nn_getenv_asint(ENV_ENABLE_I8TOU8, 1);
options->enable_dataconvert_optimize = vsi_nn_getenv_asint(ENV_ENABLE_DATACONVERT_OPTIMIZE, 1);
options->enable_stream_processor = vsi_nn_getenv_asint(ENV_ENABLE_STREAM_PROCESSOR, 1);
options->enable_rgb88_planar_nhwc = vsi_nn_getenv_asint(ENV_FORCE_RGB888_OUT_NHWC, 0);
@ -140,6 +146,9 @@ static vsi_status vsi_nn_initOptions
#endif
options->enable_slice_optimize = vsi_nn_getenv_asint(ENV_ENABLE_SLICE_OPTIMIZE, default_value);
options->enable_batch_opt = vsi_nn_getenv_asint(ENV_ENABLE_BATCH_OPT, 0);
options->enable_save_file_type = vsi_nn_getenv_asint(ENV_SAVE_FILE_TYPE, 0);
options->enable_use_image_process = vsi_nn_getenv_asint(VSI_USE_IMAGE_PROCESS, -1);
options->enable_use_from_handle = vsi_nn_getenv_asint(VSI_USE_FROM_HANDLE, -1);
return VSI_SUCCESS;
}
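
A hedged sketch (not part of the commit) of how the new options could be toggled from application code: vsi_nn_initOptions() reads the environment when the graph is created, so any override must be set beforehand. The umbrella header name and the exact semantics of each option are assumptions here.

    #include <stdlib.h>
    #include "vsi_nn_pub.h"   /* assumed umbrella header for the public API */

    /* Hypothetical setup: set the environment, then create the graph so the
     * options are picked up by vsi_nn_initOptions(). */
    static vsi_nn_graph_t* create_graph_with_options(vsi_nn_context_t ctx)
    {
        setenv("VSI_SAVE_FILE_TYPE", "1", 1);     /* enable_save_file_type     */
        setenv("VSI_USE_IMAGE_PROCESS", "1", 1);  /* enable_use_image_process  */
        setenv("VSI_USE_FROM_HANDLE", "0", 1);    /* enable_use_from_handle    */
        return vsi_nn_CreateGraph(ctx, 0, 0);
    }
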

View File

@ -1354,20 +1354,26 @@ vsi_nn_graph_t * vsi_nn_CreateGraph
graph->node_num = 0;
graph->ctx = ctx;
graph->rnn_wksp = NULL;
((vsi_nn_graph_prv_t*) graph)->options =
(vsi_nn_runtime_option_t *)malloc( sizeof( vsi_nn_runtime_option_t ));
CHECK_PTR_FAIL_GOTO(((vsi_nn_graph_prv_t*) graph)->options, "Create graph options fail.", error);
graph->node_table = (vsi_nn_map_t *)malloc( sizeof( vsi_nn_map_t ) );
graph->tensor_table = (vsi_nn_map_t *)malloc( sizeof( vsi_nn_map_t ) );
graph->isAllowFastMode = TRUE;
vsi_nn_MapInit( graph->node_table );
vsi_nn_MapInit( graph->tensor_table );
vsi_nn_initOptions( ((vsi_nn_graph_prv_t*) graph)->options );
}
else
{
VSILOGE( "Create vx graph fail." );
free( graph );
free(graph);
graph = NULL;
}
}
return graph;
error:
return graph;
} /* vsi_nn_CreateGraph() */
@ -1429,6 +1435,10 @@ void vsi_nn_ReleaseGraph
free( tmp );
}
}
if (NULL != ((vsi_nn_graph_prv_t*)ptr)->options)
{
free(((vsi_nn_graph_prv_t*)ptr)->options);
}
free( ptr );
*graph = NULL;
}
@ -1500,7 +1510,7 @@ vsi_status vsi_nn_SetupGraph
}
#if VX_GRAPH_BATCH_OPT_SUPPORT
if (graph->ctx->options.enable_batch_opt)
if (((vsi_nn_graph_prv_t*)graph)->options->enable_batch_opt)
{
/*processing batch splitting*/
status = batchInference_graph(graph, nodes_list);
@ -2064,7 +2074,7 @@ vsi_nn_node_t * vsi_nn_AddExternalNode
const char * kernel_name
)
{
vsi_nn_node_t * node;
vsi_nn_node_prv_t* node;
vsi_nn_node_id_t id;
vsi_nn_op_proc_t * node_proc;
@ -2076,16 +2086,17 @@ vsi_nn_node_t * vsi_nn_AddExternalNode
{
return NULL;
}
node = (vsi_nn_node_t *)malloc( sizeof( vsi_nn_node_t ) );
node = (vsi_nn_node_prv_t*)malloc(sizeof(vsi_nn_node_prv_t));
if( NULL != node )
{
memset( node, 0, sizeof( vsi_nn_node_t ) );
node->graph = graph;
node->op = op;
node->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE;
node->vx_param.rounding_policy = VX_ROUND_POLICY_TO_ZERO;
node->vx_param.down_scale_size_rounding = VX_CONVOLUTIONAL_NETWORK_DS_SIZE_ROUNDING_FLOOR;
memset(node, 0, sizeof(vsi_nn_node_prv_t));
node->pon.graph = graph;
node->pon.op = op;
node->pon.vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE;
node->pon.vx_param.rounding_policy = VX_ROUND_POLICY_TO_ZERO;
node->pon.vx_param.down_scale_size_rounding =
VX_CONVOLUTIONAL_NETWORK_DS_SIZE_ROUNDING_FLOOR;
/* init op */
if(node_proc->init != NULL){
@ -2093,31 +2104,31 @@ vsi_nn_node_t * vsi_nn_AddExternalNode
}
/* init output struct */
node->output.num = node_proc->output_num;
node->output.tensors = (vsi_nn_tensor_id_t *) malloc(
node->pon.output.num = node_proc->output_num;
node->pon.output.tensors = (vsi_nn_tensor_id_t*)malloc(
node_proc->output_num * sizeof( vsi_nn_tensor_id_t ) );
if ( NULL == node->output.tensors )
if (NULL == node->pon.output.tensors)
{
VSILOGE("Create output tensor id %s. fail", vsi_nn_OpGetName(op));
vsi_nn_safe_free(node);
return NULL;
}
vsi_nn_InitTensorsId( node->output.tensors, node_proc->output_num );
vsi_nn_InitTensorsId(node->pon.output.tensors, node_proc->output_num);
/* init input struct */
node->input.num = node_proc->input_num;
node->input.tensors = (vsi_nn_tensor_id_t *) malloc(
node->pon.input.num = node_proc->input_num;
node->pon.input.tensors = (vsi_nn_tensor_id_t*)malloc(
node_proc->input_num * sizeof( vsi_nn_tensor_id_t ) );
if ( NULL == node->input.tensors )
if (NULL == node->pon.input.tensors)
{
VSILOGE("Create input tensor id %s. fail", vsi_nn_OpGetName(op));
vsi_nn_safe_free(node->output.tensors);
vsi_nn_safe_free(node->pon.output.tensors);
vsi_nn_safe_free(node);
return NULL;
}
vsi_nn_InitTensorsId( node->input.tensors, node_proc->input_num );
node->attr.const_tensor_preload_type = VSI_NN_NODE_PRELOAD_NONE;
node->attr.enable_op_constraint_check = TRUE;
vsi_nn_InitTensorsId(node->pon.input.tensors, node_proc->input_num);
node->pon.attr.const_tensor_preload_type = VSI_NN_NODE_PRELOAD_NONE;
node->pon.attr.enable_op_constraint_check = TRUE;
}
id = graph->cur_nid;
if(NULL != node){
@ -2126,7 +2137,7 @@ vsi_nn_node_t * vsi_nn_AddExternalNode
graph->cur_nid ++;
}
vsi_nn_OpRegisterExternalOvxInit(op, kernel_name, node_proc);
return node;
return (vsi_nn_node_t*)node;
} /* vsi_nn_AddExternalNode() */
void vsi_nn_RemoveNode
@ -3354,24 +3365,245 @@ final:
return status;
} /* vsi_nn_ExecuteGraphLoop() */
typedef enum {
VSI_NN_ENABLE_I8TOU8 = 0,
VSI_NN_ENABLE_OPCHECK,
VSI_SAVE_FILE_TYPE,
VSI_USE_IMAGE_PROCESS,
VSI_NN_LOG_LEVEL,
VSI_NN_ENABLE_CONCAT_OPTIMIZE,
VSI_NN_ENABLE_DATACONVERT_OPTIMIZE,
VSI_VX_ENABLE_STREAM_PROCESSOR,
VSI_NN_FORCE_RGB888_OUT_NHWC,
VSI_NN_ENABLE_SLICE_OPTIMIZE,
VSI_VX_ENABLE_BATCH_OPT,
VIV_VX_ENABLE_SHADER,
VSI_USE_FROM_HANDLE,
VIV_VX_ENABLE_GRAPH_TRANSFORM
} VSI_PUBLIC_TYPE vsi_nn_runtime_variable;
vsi_status vsi_nn_SetGraphTransformOption
typedef struct {
const char* key;
int32_t value;
} VSI_PUBLIC_TYPE keyValuePair;
char* vsi_nn_GetRunTimeVariable
(
const vsi_nn_graph_t* graph,
const char* key
)
{
int32_t isValid = 1;
int32_t value = -1;
#define varSize 256
char* value_str = (char*)malloc(sizeof(char) * varSize);
CHECK_PTR_FAIL_GOTO(value_str, "Create value_str fail.", final);
memset(value_str, 0, varSize);
char tmp_value[varSize] = {0};
VSI_UNREFERENCED(tmp_value);
vsi_nn_runtime_option_t* options = ((vsi_nn_graph_prv_t*)graph)->options;
switch (vsi_nn_GetVariable(key))
{
case VIV_VX_ENABLE_SHADER:
value = options->enable_shader;
break;
case VSI_NN_ENABLE_OPCHECK:
value = options->enable_opcheck;
break;
case VSI_NN_ENABLE_I8TOU8:
value = options->enable_i8_to_u8;
break;
case VSI_VX_ENABLE_STREAM_PROCESSOR:
value = options->enable_stream_processor;
break;
case VSI_VX_ENABLE_BATCH_OPT:
value = options->enable_batch_opt;
break;
case VSI_NN_FORCE_RGB888_OUT_NHWC:
value = options->enable_rgb88_planar_nhwc;
break;
case VSI_SAVE_FILE_TYPE:
value = options->enable_save_file_type;
break;
case VSI_NN_ENABLE_CONCAT_OPTIMIZE:
value = options->enable_concat_optimize;
break;
case VSI_NN_ENABLE_SLICE_OPTIMIZE:
value = options->enable_slice_optimize;
break;
case VSI_USE_IMAGE_PROCESS:
if (options->enable_use_image_process != -1)
{
value = options->enable_use_image_process;
}
else
{
isValid = 0;
}
break;
case VSI_USE_FROM_HANDLE:
if (options->enable_use_from_handle != -1)
{
value = options->enable_use_from_handle;
}
else
{
isValid = 0;
}
break;
default:
isValid = 0;
VSILOGE("Unsupported key: %s.", key);
}
if (isValid == 1)
{
snprintf(tmp_value, varSize, "%d", value);
memcpy(value_str, tmp_value, varSize);
} else
{
goto final;
}
#undef varSize
return value_str;
final:
#undef varSize
vsi_nn_safe_free(value_str);
return value_str;
}
vsi_status vsi_nn_SetRunTimeVariable
(
vsi_nn_graph_t* graph,
const char* ctrl_str,
size_t size
const char* key,
const char* value
)
{
vsi_status status = VSI_SUCCESS;
size_t size = 1; // placeholder, not used in vxSetGraphAttribute.
if (graph == NULL)
{
status = VSI_FAILURE;
return status;
}
vsi_nn_runtime_option_t* options = ((vsi_nn_graph_prv_t*)graph)->options;
VSI_UNREFERENCED(size);
if (vsi_nn_getenv(key) == NULL)
{
switch (vsi_nn_GetVariable(key) )
{
case VIV_VX_ENABLE_SHADER:
options->enable_shader = atoi(value);
break;
case VSI_NN_ENABLE_OPCHECK:
options->enable_opcheck = atoi(value);
break;
case VSI_NN_ENABLE_I8TOU8:
options->enable_i8_to_u8 = atoi(value);
break;
case VSI_VX_ENABLE_STREAM_PROCESSOR:
options->enable_stream_processor = atoi(value);
break;
case VSI_VX_ENABLE_BATCH_OPT:
options->enable_batch_opt = atoi(value);
break;
case VSI_NN_FORCE_RGB888_OUT_NHWC:
options->enable_rgb88_planar_nhwc = atoi(value);
break;
case VSI_NN_ENABLE_CONCAT_OPTIMIZE:
options->enable_concat_optimize = atoi(value);
break;
case VSI_NN_ENABLE_DATACONVERT_OPTIMIZE:
options->enable_dataconvert_optimize = atoi(value);
break;
case VSI_NN_ENABLE_SLICE_OPTIMIZE:
options->enable_slice_optimize = atoi(value);
break;
case VSI_SAVE_FILE_TYPE:
options->enable_save_file_type = atoi(value);
break;
case VSI_USE_IMAGE_PROCESS:
options->enable_use_image_process = atoi(value);
break;
case VSI_USE_FROM_HANDLE:
options->enable_use_from_handle = atoi(value);
break;
case VIV_VX_ENABLE_GRAPH_TRANSFORM:
#ifdef VX_GRAPH_TRANSFORM_OPTION_SUPPORT
if (graph && graph->g) {
status = vxSetGraphAttribute(
graph->g, VX_GRAPH_VSI_TRANSFORM_OPTIONS, value, size);
}
#else
status = VSI_FAILURE;
VSILOGE("VX_GRAPH_TRANSFORM_OPTION_SUPPORT is not defined, please check driver version.");
#endif
break;
default:
#ifdef VX_GRAPH_ENV_SUPPORT
status = vxSetGraphEnv(graph->g, key, value);
#else
status = VSI_FAILURE;
VSILOGE("VX_GRAPH_ENV_SUPPORT is not defined, please check driver version.");
#endif
break;
}
}
return status;
}
int32_t vsi_nn_GetVariable(const char* variableKey) {
keyValuePair dict[] = {
{"VSI_NN_ENABLE_I8TOU8", VSI_NN_ENABLE_I8TOU8},
{"VSI_NN_ENABLE_OPCHECK", VSI_NN_ENABLE_OPCHECK},
{"VSI_SAVE_FILE_TYPE", VSI_SAVE_FILE_TYPE},
{"VSI_USE_IMAGE_PROCESS", VSI_USE_IMAGE_PROCESS},
{"VSI_NN_ENABLE_CONCAT_OPTIMIZE", VSI_NN_ENABLE_CONCAT_OPTIMIZE},
{"VSI_NN_ENABLE_DATACONVERT_OPTIMIZE", VSI_NN_ENABLE_DATACONVERT_OPTIMIZE},
{"VSI_VX_ENABLE_STREAM_PROCESSOR", VSI_VX_ENABLE_STREAM_PROCESSOR},
{"VSI_NN_FORCE_RGB888_OUT_NHWC", VSI_NN_FORCE_RGB888_OUT_NHWC},
{"VSI_NN_ENABLE_SLICE_OPTIMIZE", VSI_NN_ENABLE_SLICE_OPTIMIZE},
{"VSI_VX_ENABLE_BATCH_OPT", VSI_VX_ENABLE_BATCH_OPT},
{"VIV_VX_ENABLE_SHADER", VIV_VX_ENABLE_SHADER},
{"VSI_USE_FROM_HANDLE", VSI_USE_FROM_HANDLE},
{"VIV_VX_ENABLE_GRAPH_TRANSFORM", VIV_VX_ENABLE_GRAPH_TRANSFORM},
{NULL, -1}
};
for (int32_t i = 0; dict[i].key != NULL; i++) {
if (strcmp(dict[i].key, variableKey) == 0) {
return dict[i].value;
}
}
return -1;
}
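
A hedged usage sketch for the per-graph runtime-variable API shown above. The keys come from the dictionary in vsi_nn_GetVariable, environment variables (when set) take priority over vsi_nn_SetRunTimeVariable, and the getter returns a heap-allocated string the caller should free. The umbrella header name is an assumption.

    #include <stdio.h>
    #include <stdlib.h>
    #include "vsi_nn_pub.h"   /* assumed umbrella header */

    /* Hypothetical helper: override two options on an already-created graph,
     * then read one of them back. */
    static void tune_graph(vsi_nn_graph_t* graph)
    {
        char* value = NULL;
        vsi_nn_SetRunTimeVariable(graph, "VIV_VX_ENABLE_SHADER", "1");
        vsi_nn_SetRunTimeVariable(graph, "VSI_NN_ENABLE_OPCHECK", "0");

        value = vsi_nn_GetRunTimeVariable(graph, "VSI_NN_ENABLE_OPCHECK");
        if (value != NULL)
        {
            printf("opcheck = %s\n", value);
            free(value);   /* the getter malloc's the returned string */
        }
    }
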
OVXLIB_API char* vsi_nn_GenerateGraphJson
(
vsi_nn_graph_t* graph
)
{
char* json = NULL;
VSI_UNREFERENCED(graph);
#ifdef VX_GENERATE_GRAPH_JSON_API_SUPPORT
if (graph && graph->g)
{
json = vxGenerateGraphJson(graph->g);
}
#endif
return json;
}
OVXLIB_API vsi_status vsi_nn_ReleaseGraphJson
(
char* json
)
{
vsi_status status = VSI_FAILURE;
VSI_UNREFERENCED(graph);
VSI_UNREFERENCED(ctrl_str);
VSI_UNREFERENCED(size);
#ifdef VX_GRAPH_TRANSFORM_OPTION_SUPPORT
if(graph && graph->g)
{
status = vxSetGraphAttribute(graph->g, VX_GRAPH_VSI_TRANSFORM_OPTIONS, ctrl_str, size);
VSI_UNREFERENCED(json);
#ifdef VX_GENERATE_GRAPH_JSON_API_SUPPORT
if (json) {
status = vxReleaseGraphJson(json);
}
#endif
return status;
}
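
A hedged sketch of how the JSON pair above might be used. vxGenerateGraphJson is only reached when the driver headers define VX_GENERATE_GRAPH_JSON_API_SUPPORT, so the result can legitimately be NULL; the umbrella header name is an assumption.

    #include <stdio.h>
    #include "vsi_nn_pub.h"   /* assumed umbrella header */

    /* Hypothetical dump helper: generate the JSON view of a set-up graph,
     * print it, and hand it back through the matching release call. */
    static void dump_graph_json(vsi_nn_graph_t* graph)
    {
        char* json = vsi_nn_GenerateGraphJson(graph);
        if (json != NULL)
        {
            printf("%s\n", json);
            vsi_nn_ReleaseGraphJson(json);   /* releases via vxReleaseGraphJson when supported */
        }
    }
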

View File

@ -26,6 +26,7 @@
#include "vsi_nn_graph_optimization.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
@ -37,14 +38,50 @@ static vsi_bool _is_asymm_int8_norm_tensor
{
vsi_bool ret = FALSE;
ret = ( tensor != NULL
&& tensor->attr.vtl == FALSE && tensor->attr.is_const == FALSE
&& tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8
&& tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC);
ret = ( tensor != NULL &&
tensor->attr.vtl == FALSE &&
tensor->attr.is_const == FALSE &&
tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 &&
tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC
);
return ret;
}/* _is_asymm_int8_norm_tensor() */
static vsi_bool _is_symm_int8_norm_tensor
(
vsi_nn_tensor_t* tensor
)
{
vsi_bool ret = FALSE;
ret = (tensor != NULL &&
tensor->attr.vtl == FALSE &&
tensor->attr.is_const == FALSE &&
tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 &&
tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC
);
return ret;
}/* _is_symm_int8_norm_tensor() */
static vsi_bool _is_int8_norm_tensor
(
vsi_nn_graph_t* graph,
vsi_nn_tensor_t* tensor
)
{
vsi_bool ret = FALSE;
vsi_bool support_symi8 =
((vsi_nn_graph_prv_t*)graph)->options->enable_i8_to_u8 == 2;
ret = _is_asymm_int8_norm_tensor(tensor);
ret = ret || (support_symi8 && _is_symm_int8_norm_tensor(tensor));
return ret;
}/* _is_int8_norm_tensor() */
static vsi_bool _is_asymm_int8_const_tensor
(
vsi_nn_tensor_t * tensor
@ -52,14 +89,47 @@ static vsi_bool _is_asymm_int8_const_tensor
{
vsi_bool ret = FALSE;
ret = ( tensor != NULL
&& tensor->attr.is_const == TRUE
&& tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8
&& tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC);
ret = ( tensor != NULL &&
tensor->attr.is_const == TRUE &&
tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 &&
tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC
);
return ret;
}/* _is_asymm_int8_const_tensor() */
static vsi_bool _is_symm_int8_const_tensor
(
vsi_nn_tensor_t* tensor
)
{
vsi_bool ret = FALSE;
ret = (tensor != NULL &&
tensor->attr.is_const == TRUE &&
tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 &&
tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC
);
return ret;
}/* _is_symm_int8_const_tensor() */
static vsi_bool _is_int8_const_tensor
(
vsi_nn_graph_t* graph,
vsi_nn_tensor_t* tensor
)
{
vsi_bool ret = FALSE;
vsi_bool support_symi8 =
((vsi_nn_graph_prv_t*)graph)->options->enable_i8_to_u8 == 2;
ret = _is_asymm_int8_const_tensor(tensor);
ret = ret || (support_symi8 && _is_symm_int8_const_tensor(tensor));
return ret;
}/* _is_int8_const_tensor() */
static vsi_bool _is_asymm_int8_virtual_tensor
(
vsi_nn_tensor_t * tensor
@ -67,14 +137,47 @@ static vsi_bool _is_asymm_int8_virtual_tensor
{
vsi_bool ret = FALSE;
ret = ( tensor != NULL
&& tensor->attr.vtl == TRUE
&& tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8
&& tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC);
ret = ( tensor != NULL &&
tensor->attr.vtl == TRUE &&
tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 &&
tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC
);
return ret;
}/* _is_asymm_int8_virtual_tensor() */
static vsi_bool _is_symm_int8_virtual_tensor
(
vsi_nn_tensor_t* tensor
)
{
vsi_bool ret = FALSE;
ret = (tensor != NULL &&
tensor->attr.vtl == TRUE &&
tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 &&
tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC
);
return ret;
}/* _is_symm_int8_virtual_tensor() */
static vsi_bool _is_int8_virtual_tensor
(
vsi_nn_graph_t* graph,
vsi_nn_tensor_t* tensor
)
{
vsi_bool ret = FALSE;
vsi_bool support_symi8 =
((vsi_nn_graph_prv_t*)graph)->options->enable_i8_to_u8 == 2;
ret = _is_asymm_int8_virtual_tensor(tensor);
ret = ret || (support_symi8 && _is_symm_int8_virtual_tensor(tensor));
return ret;
}/* _is_int8_virtual_tensor() */
static vsi_status _add_forward_node
(
vsi_nn_graph_t* graph,
@ -199,7 +302,7 @@ static void _get_graph_input_asymm_int8_norm_tensor
vsi_nn_tensor_id_t id = node->input.tensors[j];
vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id);
if (_is_asymm_int8_norm_tensor(tensor))
if (_is_int8_norm_tensor(graph, tensor))
{
if(tensor_ids != NULL)
{
@ -251,7 +354,7 @@ static void _get_graph_output_asymm_int8_norm_tensor
vsi_nn_tensor_id_t id = node->output.tensors[j];
vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id);
if (_is_asymm_int8_norm_tensor(tensor))
if (_is_int8_norm_tensor(graph, tensor))
{
if(tensor_ids != NULL)
{
@ -360,6 +463,7 @@ static vsi_status _add_graph_dataconvert_for_int8
{
memcpy(&attr, &tensor->attr, sizeof(vsi_nn_tensor_attr_t));
attr.dtype.vx_type = VSI_NN_TYPE_UINT8;
attr.dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC;
attr.dtype.zero_point += 128;
attr.vtl = TRUE;
output = vsi_nn_AddTensor( graph, VSI_NN_TENSOR_ID_AUTO, &attr, NULL );
@ -383,6 +487,7 @@ static vsi_status _add_graph_dataconvert_for_int8
{
memcpy(&attr, &tensor->attr, sizeof(vsi_nn_tensor_attr_t));
attr.dtype.vx_type = VSI_NN_TYPE_UINT8;
attr.dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC;
attr.dtype.zero_point += 128;
attr.vtl = TRUE;
input = vsi_nn_AddTensor( graph, VSI_NN_TENSOR_ID_AUTO, &attr, NULL );
@ -788,6 +893,7 @@ static void _convert_const_I8toU8
}
attr->dtype.vx_type = VSI_NN_TYPE_UINT8;
attr->dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC;
attr->dtype.zero_point += 128;
if ( tensor->t ) vxReleaseTensor(&tensor->t);
@ -818,7 +924,7 @@ static vsi_status _convert_graph_const_tensor
vsi_nn_tensor_id_t id = node->input.tensors[j];
vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id);
if (_is_asymm_int8_const_tensor(tensor))
if (_is_int8_const_tensor(graph, tensor))
{
_convert_const_I8toU8(graph, id);
}
@ -835,11 +941,9 @@ static vsi_status _convert_virtual_tensor_attr
vsi_nn_tensor_t * tensor
)
{
if (_is_asymm_int8_virtual_tensor(tensor))
{
tensor->attr.dtype.vx_type = VSI_NN_TYPE_UINT8;
tensor->attr.dtype.zero_point += 128;
}
tensor->attr.dtype.vx_type = VSI_NN_TYPE_UINT8;
tensor->attr.dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC;
tensor->attr.dtype.zero_point += 128;
return VSI_SUCCESS;
}/* _convert_virtual_tensor_attr() */
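
For reference, the +128 shift performed here keeps dequantized values unchanged: with an unchanged scale s, an int8 value q with zero point z satisfies s*(q - z) = s*((q + 128) - (z + 128)), so storing q + 128 as uint8 with zero point z + 128 is lossless. For symmetric int8 (z = 0, now covered when enable_i8_to_u8 == 2) the same shift simply produces an asymmetric uint8 tensor with zero point 128.
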
@ -849,7 +953,7 @@ static vsi_status _convert_graph_virtual_tensor
vsi_nn_graph_t* graph
)
{
vsi_status status = VSI_FAILURE;
vsi_status status = VSI_SUCCESS;
uint32_t node_num = graph->node_num;
vsi_nn_node_t* node = NULL;
uint32_t i = 0;
@ -865,7 +969,10 @@ static vsi_status _convert_graph_virtual_tensor
vsi_nn_tensor_id_t id = node->input.tensors[j];
vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id);
status = _convert_virtual_tensor_attr(tensor);
if (_is_int8_virtual_tensor(graph, tensor))
{
status = _convert_virtual_tensor_attr(tensor);
}
}
for(j = 0; j < node->output.num; j++)
@ -873,7 +980,10 @@ static vsi_status _convert_graph_virtual_tensor
vsi_nn_tensor_id_t id = node->output.tensors[j];
vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id);
status = _convert_virtual_tensor_attr(tensor);
if (_is_int8_virtual_tensor(graph, tensor))
{
status = _convert_virtual_tensor_attr(tensor);
}
}
}
@ -925,7 +1035,7 @@ vsi_status vsi_nn_OptimizeGraph
status = VSI_SUCCESS;
if (!nbg_flag && graph->ctx->options.enable_asymi8_to_u8)
if (!nbg_flag && ((vsi_nn_graph_prv_t*)graph)->options->enable_i8_to_u8)
{
status = _graph_optimization_convert_int8_to_uint8(graph, dirty);
CHECK_STATUS_FAIL_GOTO(status, final);

View File

@ -452,7 +452,8 @@ void vsi_nn_internal_init_tensor_attr
if( dtype->qnt_type == VSI_NN_QNT_TYPE_NONE &&
( dtype->vx_type != VSI_NN_TYPE_FLOAT16 &&
dtype->vx_type != VSI_NN_TYPE_FLOAT32 &&
dtype->vx_type != VSI_NN_TYPE_BFLOAT16 ) )
dtype->vx_type != VSI_NN_TYPE_BFLOAT16 &&
dtype->vx_type != VSI_NN_TYPE_INT32) )
{
attr->dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
attr->dtype.vx_type = VSI_NN_TYPE_FLOAT16;

View File

@ -208,6 +208,10 @@ static _node_template s_template[] =
/* RESIZE_3D */ NULL,
/* REDUCEL2 */ NULL,
/* CROP_AND_RESIZE */ NULL,
/* BITCAST */ NULL,
/* GROUPED_CONV3D */ NULL,
/* COL2IM */ NULL,
/* L1_LAYER_NORM */ NULL,
};
//_compiler_assert( _cnt_of_array(s_template) == VSI_NN_OP_NUM, vsi_nn_node_attr_template_c );

View File

@ -26,6 +26,7 @@
#include "vsi_nn_client_op.h"
#include "vsi_nn_node.h"
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
@ -281,7 +282,7 @@ vsi_bool vsi_nn_OpCheck
if ( NULL != proc )
{
ret = TRUE;
if ( proc->check && node->graph->ctx->options.enable_opcheck)
if ( proc->check && ((vsi_nn_graph_prv_t*)(node->graph))->options->enable_opcheck)
{
ret = proc->check( node, inputs, outputs );
}

View File

@ -144,6 +144,17 @@ static void print_tensor
tensor->attr.dtype.scale_dim);
ext_attr[count] = 0;
break;
#endif
#ifdef VSI_PER_GROUP_QUANTIZATION_SUPPORT
case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC:
count = snprintf(&ext_attr[0],
_EXT_ATTR_BUF_SZ,
"SYM GPTQ axis=%d, count=%d, group_size=%d",
tensor->attr.dtype.group_channel_dim,
tensor->attr.dtype.group_count,
tensor->attr.dtype.group_size);
ext_attr[count] = 0;
break;
#endif
default:
vsi_nn_strncpy(ext_attr, "NONE", _EXT_ATTR_BUF_SZ);
@ -430,6 +441,25 @@ static vsi_bool _init_tensor
VSILOGE(
"can't support qnt_type "
"VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_ASYMMETRIC.");
#endif
case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC:
#ifdef VSI_PER_GROUP_QUANTIZATION_SUPPORT
params.quant_format = (vsi_enum)VX_QUANT_AFFINE_SCALE_PER_GROUP;
// Workaround: the driver does not support const scales, so copy them into a writable buffer
scales = (float*)malloc(sizeof(float) * tensor->attr.dtype.group_count);
CHECK_PTR_FAIL_GOTO( scales, "Create buffer fail.", final );
memcpy(scales, tensor->attr.dtype.group_scales, tensor->attr.dtype.group_count * sizeof(float));
params.quant_data.affinePerGroup.channel_dim = tensor->attr.dtype.group_channel_dim;
params.quant_data.affinePerGroup.group_size = tensor->attr.dtype.group_size;
params.quant_data.affinePerGroup.scale_group_count = tensor->attr.dtype.group_count;
params.quant_data.affinePerGroup.scales = scales;
params.quant_data.affinePerGroup.zero_points = NULL;
params.quant_data.affinePerGroup.zero_point_group_count = 0;
break;
#else
VSILOGE(
"can't support qnt_type "
"VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC.");
#endif
default:
break;
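
A hedged sketch (field names taken from the code above; the concrete shape, group size, and group count are purely illustrative, and their exact relationship is defined by the driver) of how a per-group symmetric tensor might be described before it is added to a graph, when VSI_PER_GROUP_QUANTIZATION_SUPPORT is defined:

    #include <string.h>
    #include "vsi_nn_pub.h"   /* assumed umbrella header */

    /* Hypothetical GPTQ-style per-group symmetric weight: 512 elements split
     * into 16 groups of 32 along axis 0, one scale per group. */
    static vsi_nn_tensor_id_t add_group_quant_weight(vsi_nn_graph_t* graph,
                                                     float* group_scales /* 16 scales */)
    {
        vsi_nn_tensor_attr_t attr;
        memset(&attr, 0, sizeof(attr));
        attr.dim_num = 1;
        attr.size[0] = 512;
        attr.is_const = TRUE;
        attr.dtype.vx_type = VSI_NN_TYPE_INT8;
        attr.dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC;
        attr.dtype.group_channel_dim = 0;   /* axis that is split into groups   */
        attr.dtype.group_size = 32;         /* elements per group on that axis  */
        attr.dtype.group_count = 16;        /* number of entries in group_scales */
        attr.dtype.group_scales = group_scales;
        return vsi_nn_AddTensor(graph, VSI_NN_TENSOR_ID_AUTO, &attr, NULL /* data omitted */);
    }
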

View File

@ -58,6 +58,7 @@ typedef struct _vsi_nn_graph_prv
// Add graph internal attribute here...
vsi_nn_swap_handle_cache_t swap_handle_cache;
vsi_nn_runtime_option_t* options;
} vsi_nn_graph_prv_t;
/** Internal Node structure, internal use only. */