Update internal to 1.1.88 release (#657)

Internal ovxlib SHA 32fe479af5549e894bcd40de5740ae0dfd42bdb9

Type: Code Improvement

Signed-off-by: Feiyue Chen <Feiyue.Chen@verisilicon.com>
Chen Feiyue 2023-11-03 13:16:33 +08:00 committed by GitHub
parent 10081790ee
commit 1bb1e070f2
120 changed files with 11472 additions and 1753 deletions

View File

@@ -194,3 +194,4 @@ DEF_OP(INVERSE_SIGMOID)
 DEF_OP(GRID_SAMPLE)
 DEF_OP(LPNORM)
 DEF_OP(RESIZE_3D)
+DEF_OP(REDUCEL2)

View File

@@ -35,7 +35,7 @@ typedef struct _vsi_nn_deconv_param
     uint32_t ksize[2];
     uint32_t stride[2];
     /* Pad left, right, top, bottom */
-    uint32_t pad[4];
+    int32_t  pad[4];
     /* Pad type default value shall be AUTO */
     uint32_t pad_type;
     uint32_t weights;

View File

@@ -44,6 +44,7 @@ typedef struct _vsi_nn_max_pool3d_param
     uint32_t pad[6];
     /* Pad type default value shall be AUTO */
     vsi_nn_pad_e pad_type;
+    uint32_t dilation[3];
 } vsi_nn_max_pool3d_param;
 _compiler_assert(offsetof(vsi_nn_max_pool3d_param, local) == 0, \
     vsi_nn_max_pool3d_h );

View File

@@ -30,11 +30,20 @@
 extern "C" {
 #endif

+typedef struct _vsi_nn_moments_lcl_data
+{
+    vsi_bool use_internal_node;
+    uint32_t perm[VSI_NN_MAX_DIM_NUM];
+    int32_t  axis[VSI_NN_MAX_DIM_NUM];
+} vsi_nn_moments_lcl_data;
+
 typedef struct _vsi_nn_moments_param
 {
     const int32_t* axis;
     int32_t axis_num;
     vsi_bool keep_dim;
+    vsi_nn_moments_lcl_data *lcl_data;
 } vsi_nn_moments_param;

 #ifdef __cplusplus

View File

@@ -50,6 +50,7 @@ typedef struct _vsi_nn_pool_param
     vsi_nn_pad_e pad_type;
     /* poolwithargmax layer local data structure */
     vsi_nn_pool_lcl_data *local;
+    uint32_t dilation[2];
 } vsi_nn_pool_param;

 #ifdef __cplusplus

View File

@@ -0,0 +1,47 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+#ifndef _VSI_NN_OP_REDUCEL2_H
+#define _VSI_NN_OP_REDUCEL2_H
+
+#include "vsi_nn_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct _vsi_nn_reducel2_param
+{
+    struct _reducel2_local_data_t * lcl;
+    vx_int32 *axis;
+    vx_uint32 axis_num;
+    vx_bool keep_dim;
+} vsi_nn_reducel2_param;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
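
Reviewer note: for anyone wiring this up, a minimal sketch of filling the new parameter block through the nn_param union (the union member is added in the node-type header below; the axes values here are hypothetical, and the fragment assumes the ovxlib headers and a node created via vsi_nn_AddNode):

    /* Sketch only: assumes node came from
     * vsi_nn_AddNode(graph, VSI_NN_OP_REDUCEL2, 1, 1, NULL) and that
     * axes outlives graph setup. */
    static int32_t axes[2] = { 0, 1 };   /* reduce the two innermost dims */
    node->nn_param.reducel2.axis     = axes;
    node->nn_param.reducel2.axis_num = 2;
    node->nn_param.reducel2.keep_dim = vx_false_e;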

View File

@@ -466,54 +466,109 @@ static VSI_INLINE_API uint16_t fp32_to_bfp16_rtne
 static VSI_INLINE_API uint8_t fp32_to_fp8_e4m3(float in, const float scale) {
     float fp8_f32 = in / scale;
-    int32_t fp8_i32 = *((int32_t*)&fp8_f32);
-    //int32_t mask = (int32_t)(pow(2, 32) - 1 - (pow(2, 23 - 3) - 1));
-    int32_t eps = 1 << (23 - 3 - 1);
-    fp8_i32 += eps;
-    //fp8_i32 &= mask;
-    {
-        int sign = (fp8_i32 >> (FLOAT_EXPONENT_SIZE + FLOAT_MANTISSA_SIZE)) & 0x1;
-        int exp = (fp8_i32 >> FLOAT_MANTISSA_SIZE) & 0xff;
-        int expShiftValue = FLOAT8_E4M3_BIAS_EXPONENT - FLOAT_BIAS_EXPONENT;
-        int mantissa = (fp8_i32 >> (FLOAT_MANTISSA_SIZE - FLOAT8_E4M3_MANTISSA_SIZE)) & 0x7;
-        exp = (exp + expShiftValue) & 0xF;
-        return (uint8_t)(sign << 7 | exp << 3 | mantissa);
-    }
+    int32_t in_val = *((int32_t*)&fp8_f32);
+    uint32_t in_sign = (in_val >> (FLOAT_EXPONENT_SIZE + FLOAT_MANTISSA_SIZE)) & 0x1; /* bit 31 is sign */
+    uint32_t in_exp = (in_val >> FLOAT_MANTISSA_SIZE) & 0xFF; /* bits [30:23] are exp */
+    uint32_t in_man = (in_val & 0x7FFFFF); /* low 23 bits are man */
+    uint32_t out_sign = in_sign;
+    int32_t out_exp = (in_exp + FLOAT8_E4M3_BIAS_EXPONENT - FLOAT_BIAS_EXPONENT); /* in_exp - fp32 bias + E4M3 bias */
+    uint32_t man_rounding = 0, out_man = 0, out_val = 0;
+    man_rounding = (in_man + 0x80000) >> 20; /* man_rounding is 3 bits */
+    if (((man_rounding >> 3) && 0x1) == 1) {
+        /* when in_man is like 0b11_1, exp += 1 and the mantissa becomes 0 */
+        out_exp += 1;
+    }
+    /* Clamp denorm to zero */
+    if (out_exp <= 0) {
+        out_exp = 0;
+        man_rounding = 0;
+        out_sign = 0;
+    }
+    out_man = man_rounding & 0x7; /* keep low 3 bits of man */
+    /* overflow policy */
+    if (out_exp >= 16 || (out_exp == 15 && out_man == 7)) {
+        out_exp = 15;
+        out_man = 6;
+#if 0
+        if (mode == VX_CONVERT_POLICY_SATURATE) {
+            out_exp = 15;
+            out_man = 6;
+        } else if (mode == VX_CONVERT_POLICY_INF) {
+            out_exp = 15;
+            out_man = 7;
+        } else {
+            vxmASSERT(0 && "Error overflow mode!\n");
+        }
+#endif
+    }
+    out_val = (out_sign << 7) | (out_exp << 3) | out_man;
+    return (uint8_t)(out_val & 0xFF);
 } /* fp32_to_fp8_e4m3() */

 static VSI_INLINE_API uint8_t fp32_to_fp8_e5m2(float in, const float scale) {
     float fp8_f32 = in / scale;
-    int32_t fp8_i32 = *((int32_t*)&fp8_f32);
-    //int32_t mask = (int32_t)(pow(2, 32) - 1 - (pow(2, 23 - 2) - 1));
-    int32_t eps = 1 << (23 - 2 - 1);
-    fp8_i32 += eps;
-    //fp8_i32 &= mask;
-    {
-        int sign = (fp8_i32 >> (FLOAT_EXPONENT_SIZE + FLOAT_MANTISSA_SIZE)) & 0x1;
-        int exp = (fp8_i32 >> FLOAT_MANTISSA_SIZE) & 0xff;
-        int expShiftValue = FLOAT8_E5M2_BIAS_EXPONENT - FLOAT_BIAS_EXPONENT;
-        int mantissa = (fp8_i32 >> (FLOAT_MANTISSA_SIZE - FLOAT8_E5M2_MANTISSA_SIZE)) & 0x3;
-        exp = (exp + expShiftValue) & 0x1F;
-        return (uint8_t)(sign << 7 | exp << 2 | mantissa);
-    }
+    int32_t in_val = *((int32_t*)&fp8_f32);
+    uint32_t in_sign = (in_val >> (FLOAT_EXPONENT_SIZE + FLOAT_MANTISSA_SIZE)) & 0x1; /* bit 31 is sign */
+    uint32_t in_exp = (in_val >> FLOAT_MANTISSA_SIZE) & 0xFF; /* bits [30:23] are exp */
+    uint32_t in_man = (in_val & 0x7FFFFF); /* low 23 bits are man */
+    uint32_t out_sign = in_sign;
+    int32_t out_exp = (in_exp + FLOAT8_E5M2_BIAS_EXPONENT - FLOAT_BIAS_EXPONENT); /* in_exp - fp32 bias + E5M2 bias */
+    uint32_t man_rounding = 0, out_man = 0, out_val = 0;
+    man_rounding = (in_man + 0x100000) >> 21; /* man_rounding is 2 bits */
+    if (((man_rounding >> 2) && 0x1) == 1) {
+        /* when in_man is like 0b11, exp += 1 and the mantissa becomes 0 */
+        out_exp += 1;
+    }
+    /* Clamp denorm to zero */
+    if (out_exp <= 0) {
+        out_exp = 0;
+        man_rounding = 0;
+        out_sign = 0;
+    }
+    out_man = man_rounding & 0x3; /* keep low 2 bits of man */
+    /* overflow policy */
+    if (out_exp >= 31) {
+        out_exp = 30;
+        out_man = 3;
+#if 0
+        if (mode == VX_CONVERT_POLICY_SATURATE) {
+            out_exp = 30;
+            out_man = 3;
+        } else if (mode == VX_CONVERT_POLICY_INF) {
+            out_exp = 31;
+            out_man = 0;
+        } else {
+            vxmASSERT(0 && "Error overflow mode!\n");
+        }
+#endif
+    }
+    out_val = (out_sign << 7) | (out_exp << 2) | out_man;
+    return (uint8_t)(out_val & 0xFF);
 } /* fp32_to_fp8_e5m2() */

 static VSI_INLINE_API float fp8_e4m3_to_fp32(uint8_t in, const float scale) {
     float val_fp32;
     uint32_t signOut = 0;
     uint32_t exponentOut = 0;
     uint32_t mantissaOut = 0;
     uint32_t out_u = 0;
-    {
     uint32_t signIn;
     uint32_t exponentIn;
     uint32_t mantissaIn;
-    int expShiftValue = FLOAT_BIAS_EXPONENT - FLOAT8_E4M3_BIAS_EXPONENT;
-    //uint32_t i = 0;
-    //uint32_t intMsk = 0x4;
+    uint32_t expShiftValue = FLOAT_BIAS_EXPONENT - FLOAT8_E4M3_BIAS_EXPONENT;

     signIn = (in >> (FLOAT8_E4M3_EXPONENT_SIZE + FLOAT8_E4M3_MANTISSA_SIZE)) & 0x1;
     exponentIn = (in >> FLOAT8_E4M3_MANTISSA_SIZE) & 0xF;
@@ -521,13 +576,30 @@ static VSI_INLINE_API float fp8_e4m3_to_fp32(uint8_t in, const float scale) {
     signOut = signIn;
-    if (exponentIn == 0 && mantissaIn == 0)
-    {
+    /* clamp subnorm */
+    if (exponentIn == 0) {
         goto final;
     }
+/*
+    if (exponentIn == 0 && mantissaIn == 0)
+    {
+        break;
+    }
+    else if (exponentIn == 0)
+    {
+        while (!(mantissaIn & intMsk))
+        {
+            intMsk >>= 1;
+            ++i;
+        }
+        exponentOut = (exponentIn + expShiftValue - i) & 0xff;
+        mantissaIn = ((mantissaIn ^ intMsk) << (i + 1));
+        mantissaOut = (mantissaIn << (FLOAT_MATISSA_SIZE - FLOAT8_E4M3_MANTISSA_SIZE)) & 0x7fffff;
+        break;
+    }
+*/
-    if (exponentIn == 0xf && mantissaIn == 0x7)
-    {
+    if (exponentIn == 0xf && mantissaIn == 0x7) {
         exponentOut = 0xff;
         mantissaOut = 0x400000;
         goto final;
@@ -535,8 +607,7 @@ static VSI_INLINE_API float fp8_e4m3_to_fp32(uint8_t in, const float scale) {
     exponentOut = (exponentIn + expShiftValue) & 0xff;
     mantissaOut = (mantissaIn << (FLOAT_MANTISSA_SIZE - FLOAT8_E4M3_MANTISSA_SIZE)) & 0x7fffff;
-    }
 final:
     out_u = signOut << 31 | exponentOut << 23 | mantissaOut;
     val_fp32 = *((float*)&out_u);
@@ -546,44 +617,60 @@ final:
 static VSI_INLINE_API float fp8_e5m2_to_fp32(int8_t in, const float scale) {
     float val_fp32;
     uint32_t signOut = 0;
     uint32_t exponentOut = 0;
     uint32_t mantissaOut = 0;
     uint32_t out_u = 0;
-    {
     uint32_t signIn;
     uint32_t exponentIn;
     uint32_t mantissaIn;
-    int expShiftValue = FLOAT_BIAS_EXPONENT - FLOAT8_E5M2_BIAS_EXPONENT;
-    //uint32_t i = 0;
-    //uint32_t intMsk = 0x2;
+    uint32_t expShiftValue = FLOAT_BIAS_EXPONENT - FLOAT8_E5M2_BIAS_EXPONENT;

-    signIn = (in >> 7) & 0x1;
-    exponentIn = (in >> 2) & 0x1F;
+    signIn = (in >> (FLOAT8_E5M2_EXPONENT_SIZE + FLOAT8_E5M2_MANTISSA_SIZE)) & 0x1;
+    exponentIn = (in >> FLOAT8_E5M2_MANTISSA_SIZE) & 0x1F;
     mantissaIn = in & 0x3;
     signOut = signIn;
-    if (exponentIn == 0 && mantissaIn == 0)
-    {
+    /* clamp subnorm */
+    if (exponentIn == 0) {
         goto final;
     }
+/*
+    if (exponentIn == 0 && mantissaIn == 0)
+    {
+        break;
+    }
+    else if (exponentIn == 0)
+    {
+        while (!(mantissaIn & intMsk))
+        {
+            intMsk >>= 1;
+            ++i;
+        }
+        exponentOut = (exponentIn + expShiftValue - i) & 0xff;
+        mantissaIn = ((mantissaIn ^ intMsk) << (i + 1));
+        mantissaOut = (mantissaIn << (FLOAT_MATISSA_SIZE - FLOAT8_E5M2_MANTISSA_SIZE)) & 0x7fffff;
+        break;
+    }
+*/
-    if (exponentIn == 0x1f && mantissaIn == 0x3)
-    {
+    if (exponentIn == 0x1f && mantissaIn == 0x3) {
         exponentOut = 0xff;
         mantissaOut = 0x400000;
         goto final;
     }
     exponentOut = (exponentIn + expShiftValue) & 0xff;
     mantissaOut = (mantissaIn << (FLOAT_MANTISSA_SIZE - FLOAT8_E5M2_MANTISSA_SIZE)) & 0x7fffff;
-    }
 final:
     out_u = signOut << 31 | exponentOut << 23 | mantissaOut;
     val_fp32 = *((float*)&out_u);
     return val_fp32 * scale;
 } /* fp8_e5m2_to_fp32() */
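
Reviewer note: a standalone sketch of the new E4M3 rounding path, useful as a sanity check (this is not the library code; the FLOAT_* constants are assumed to carry their usual IEEE-754 values):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Assumed IEEE-754 values for the constants used in the diff. */
    #define FLOAT_MANTISSA_SIZE       23
    #define FLOAT_BIAS_EXPONENT       127
    #define FLOAT8_E4M3_BIAS_EXPONENT 7

    /* Mirrors the new fp32_to_fp8_e4m3 scheme: add half of the dropped
     * mantissa (1 << 19), keep 3 bits, carry into the exponent on rounding
     * overflow, clamp denormals to zero, saturate at the E4M3 maximum. */
    static uint8_t to_e4m3(float f)
    {
        uint32_t bits;
        memcpy(&bits, &f, sizeof bits);          /* no strict-aliasing UB */
        uint32_t sign = bits >> 31;
        int32_t  exp  = (int32_t)((bits >> FLOAT_MANTISSA_SIZE) & 0xFF)
                      + FLOAT8_E4M3_BIAS_EXPONENT - FLOAT_BIAS_EXPONENT;
        uint32_t man  = ((bits & 0x7FFFFF) + 0x80000) >> 20;  /* rounded */
        if (man >> 3) { exp += 1; man = 0; }     /* rounding carried over */
        if (exp <= 0) { return 0; }              /* denorm -> zero */
        man &= 0x7;
        if (exp >= 16 || (exp == 15 && man == 7)) { exp = 15; man = 6; }
        return (uint8_t)((sign << 7) | ((uint32_t)exp << 3) | man);
    }

    int main(void)
    {
        /* 0.3f rounds to the E4M3 value 0.3125: exp = 5, man = 2 -> 0x2A. */
        printf("0x%02X\n", to_e4m3(0.3f));
        return 0;
    }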

View File

@@ -241,7 +241,7 @@ OVXLIB_API vsi_status vsi_nn_VerifyGraph
  */
 OVXLIB_API vsi_status vsi_nn_RunGraph
     (
-    const vsi_nn_graph_t * graph
+    vsi_nn_graph_t * graph
     );

 /**
@@ -556,7 +556,7 @@ OVXLIB_API vsi_bool vsi_nn_SetGraphOutputs
  * @param[in] graph Graph handle
  * @param[in] id Node id to be removed.
  */
-void vsi_nn_RemoveNode
+OVXLIB_API void vsi_nn_RemoveNode
     (
     vsi_nn_graph_t * graph,
     vsi_nn_node_id_t id
@@ -788,6 +788,14 @@ OVXLIB_API vsi_status vsi_nn_ExecuteGraphLoop
     vsi_nn_graph_t* graph,
     vsi_nn_tensor_t *max_iteration_tensor
     );

+OVXLIB_API vsi_status vsi_nn_SetGraphTransformOption
+    (
+    vsi_nn_graph_t* graph,
+    const char* ctrl_str,
+    size_t size
+    );
+
 #ifdef __cplusplus
 }
 #endif

View File

@@ -39,7 +39,7 @@ vx_tensor vsi_nn_CreateRawTensorFromData
     vsi_nn_tensor_attr_t * attr
     );

-vsi_status vsi_nn_OptimizeGraph
+OVXLIB_API vsi_status vsi_nn_OptimizeGraph
     (
     vsi_nn_graph_t* graph,
     vsi_bool *dirty

View File

@@ -208,6 +208,7 @@
 #include "ops/vsi_nn_op_grid_sample.h"
 #include "ops/vsi_nn_op_lpnorm.h"
 #include "ops/vsi_nn_op_resize_3d.h"
+#include "ops/vsi_nn_op_reducel2.h"
 /* custom node head define define */
 #include "custom/vsi_nn_custom_node_type.h"
 #include "ops/vsi_nn_op_inverse_sigmoid.h"
@@ -404,6 +405,7 @@ typedef union _vsi_nn_nn_param
     vsi_nn_grid_sample_param gridsample;
     vsi_nn_lpnorm_param lpnorm;
     vsi_nn_resize_3d_param resize_3d;
+    vsi_nn_reducel2_param reducel2;
     void* client_param;

     /* custom node data struct define */

View File

@@ -268,7 +268,7 @@ vsi_status vsi_nn_OpOptimize
  *
  * @return VSI_SUCCESS on success, or error code otherwise.
  */
-vsi_bool vsi_nn_OpCheck
+OVXLIB_API vsi_bool vsi_nn_OpCheck
     (
     vsi_nn_op_t op,
     vsi_nn_node_t * node,

View File

@@ -264,6 +264,14 @@ OVXLIB_API vsi_status vsi_nn_AddBinaryGraphInputsWithCropParam
     uint32_t enable_nodes_count
     );

+OVXLIB_API vsi_status vsi_nn_AddBinaryGraphInputsWithCropParamForCropOnly
+    (
+    vsi_nn_graph_t* graph,
+    vsi_nn_node_id_t* enable_nodes,
+    vsi_bool* crop_set_start_only,
+    uint32_t enable_nodes_count
+    );
+
 OVXLIB_API vsi_status vsi_nn_UpdateCropParamsForBinaryGraph
     (
     vsi_nn_graph_t* graph,

View File

@@ -614,6 +614,13 @@ OVXLIB_API vsi_status vsi_nn_SwapTensorHandle
     vsi_nn_tensor_t * tensor1
     );

+OVXLIB_API vsi_status vsi_nn_SwapTensorHandleWithCache
+    (
+    vsi_nn_graph_t * graph,
+    vsi_nn_tensor_t * tensor0,
+    vsi_nn_tensor_t * tensor1
+    );
+
 OVXLIB_API vsi_size_t vsi_nn_vxGetTensorElementNum
     (
     vsi_nn_tensor_attr_t *attr

View File

@@ -33,7 +33,7 @@ extern "C"{
 #define VSI_NN_VERSION_MAJOR 1
 #define VSI_NN_VERSION_MINOR 1
-#define VSI_NN_VERSION_PATCH 84
+#define VSI_NN_VERSION_PATCH 88
 #define VSI_NN_VERSION \
     (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
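
Reviewer note: with this bump the packed constant works out to 1 * 10000 + 1 * 100 + 88 = 10188.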

View File

@@ -42,7 +42,7 @@ __BEGIN_DECLS
 /*
  * Define kernel meta.
  */
-#define _INPUT_NUM          (1)
+#define _INPUT_NUM          (2)
 #define _OUTPUT_NUM         (1)
 #define _CPU_IO_NUM         (_INPUT_NUM + _OUTPUT_NUM)
 #define _KERNEL_NAME        CVIVANTE_NAMESPACE("cpu.custom_warp_affine")
@@ -54,6 +54,7 @@ __BEGIN_DECLS
 static vx_param_description_t _custom_warp_affine_kernel_param_def[] =
 {
     {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL},
     {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
     {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
     {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
@@ -66,8 +67,9 @@ __BEGIN_DECLS
     // Add kererl parameters here
 };
 #define _CUSTOM_WARP_AFFINE_PARAM_NUM  _cnt_of_array( _custom_warp_affine_kernel_param_def )
-#define SCALAR_INPUT_TYPE     (2)
-#define SCALAR_MATRIX_OFFSET  (3)
+#define SCALAR_INPUT_TYPE     (3)
+#define SCALAR_MATRIX_OFFSET  (4)
+#define SCALAR_INPUT_RGB_TYPE (10)

 static void _transform_affine
     (
@@ -142,44 +144,60 @@ DEF_KERNEL_EXECUTOR(_compute)
     tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
     tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
+    tensors[2] = (vsi_nn_kernel_tensor_t)param[2];

     attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
     CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
-    attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
-    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
-    out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] );
+    attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] );
+    CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );
+    out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] );

     /* alloc the float32 data buffer */
-    buffer[1] = (float *)malloc(out_elements * sizeof(float));
-    CHECK_PTR_FAIL_GOTO( buffer[1], "Create input buffer fail.", final );
-    memset(buffer[1], 0, out_elements * sizeof(float));
-
     buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
     CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final );

+    if (tensors[1])
+    {
+        attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
+        CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
+        buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE );
+        CHECK_PTR_FAIL_GOTO( buffer[1], "Create input buffer fail.", final );
+    }
+
+    buffer[2] = (float *)malloc(out_elements * sizeof(float));
+    CHECK_PTR_FAIL_GOTO( buffer[2], "Create input buffer fail.", final );
+    memset(buffer[2], 0, out_elements * sizeof(float));
+
     status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_TYPE],
             &type);
-    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &rgb_type);
+    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_RGB_TYPE], &rgb_type);
     CHECK_STATUS_FAIL_GOTO(status, final );

     for (i = 0; i < 6; i++)
     {
-        status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MATRIX_OFFSET + i],
-            &matrix[i]);
-        CHECK_STATUS_FAIL_GOTO(status, final );
-    }
-
-    width = attr[1]->shape->data[0];
-    height = attr[1]->shape->data[1];
-    for(i = 2; i < (vsi_size_t)attr[1]->shape->size; ++i)
-    {
-        outer_size *= attr[1]->shape->data[i];
+        if (buffer[1] == NULL)
+        {
+            status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MATRIX_OFFSET + i],
+                &matrix[i]);
+            CHECK_STATUS_FAIL_GOTO(status, final );
+        }
+        else
+        {
+            matrix[i] = buffer[1][i];
+        }
+    }
+
+    width = attr[2]->shape->data[0];
+    height = attr[2]->shape->data[1];
+    for(i = 2; i < (vsi_size_t)attr[2]->shape->size; ++i)
+    {
+        outer_size *= attr[2]->shape->data[i];
     }

     // Do something
     for (b = 0; b < outer_size; b++)
     {
         float *src_base = buffer[0] + b * attr[0]->shape->data[0] * attr[0]->shape->data[1];
-        float *dst_base = buffer[1] + b * width * height;
+        float *dst_base = buffer[2] + b * width * height;

         if ( rgb_type == VSI_NN_WARP_AFFINE_TYPE_RGB )
         {
@@ -274,8 +292,8 @@ DEF_KERNEL_EXECUTOR(_compute)
         }
     }

-    status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1],
-            buffer[1], out_elements );
+    status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2],
+            buffer[2], out_elements );
     CHECK_STATUS_FAIL_GOTO( status, final );
 final:
     for( i = 0; i < _CPU_IO_NUM; i ++ )
@@ -350,7 +368,7 @@ static vsi_nn_kernel_node_t _setup
             node_params[SCALAR_MATRIX_OFFSET + i] = vsi_nn_kernel_scalar_create(
                     graph, F32, &buffer[i] );
         }
-        node_params[9] = vsi_nn_kernel_scalar_create(
+        node_params[SCALAR_INPUT_RGB_TYPE] = vsi_nn_kernel_scalar_create(
                 graph, I32, &rgb_type );

         /* Pass parameters to node. */
@@ -360,7 +378,7 @@ static vsi_nn_kernel_node_t _setup
         {
             vsi_nn_kernel_scalar_release( &node_params[SCALAR_MATRIX_OFFSET + i] );
         }
-        vsi_nn_kernel_scalar_release( &node_params[9] );
+        vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_RGB_TYPE] );
         }
     }
     return node;

View File

@@ -49,29 +49,52 @@ typedef enum _custom_warp_affine_type_e
     bilinear = VSI_NN_INTERPOLATION_BILINEAR,
 }custom_warp_affine_type_e;

+#define _CUSTOM_WARP_AFFINE_2D_KERNEL_SOURCE "custom_warp_affine_2d"
 #define _CUSTOM_WARP_AFFINE_KERNEL_SOURCE "custom_warp_affine"
 #define _CUSTOM_WARP_AFFINE_RGB_KERNEL_SOURCE "custom_warp_affine_rgb"
+#define _CUSTOM_WARP_AFFINE_OPTIONAL_KERNEL_SOURCE "custom_warp_affine_optional"
+#define _CUSTOM_WARP_AFFINE_RGB_OPTIONAL_KERNEL_SOURCE "custom_warp_affine_rgb_optional"

 // Add kernel hashtable here
-#define CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, IMG_2D, RGB_TYPE ) \
-        (( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (TYPE << 16) | (IMG_2D << 20) | (RGB_TYPE << 24))
+#define CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, IMG_2D, RGB_TYPE, OPTIONAL_INTPUT ) \
+        (( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (TYPE << 16) | (IMG_2D << 20) | \
+        (RGB_TYPE << 24) | (OPTIONAL_INTPUT << 28))

 #define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \
-        { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0, 0 ), \
+        { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0, 0, 0 ), \
          CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE), \
          _CUSTOM_WARP_AFFINE_KERNEL_SOURCE }

 #define PACK_2D_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \
-        { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1, 0 ), \
+        { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1, 0, 0 ), \
          CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_2D"), \
-         _CUSTOM_WARP_AFFINE_KERNEL_SOURCE }
+         _CUSTOM_WARP_AFFINE_2D_KERNEL_SOURCE }

+#define PACK_OPTIONAL_INPUT_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \
+        { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0, 0, 1 ), \
+         CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_optional_input"), \
+         _CUSTOM_WARP_AFFINE_OPTIONAL_KERNEL_SOURCE }
+
+#define PACK_OPTIONAL_INPUT_2D_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \
+        { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1, 0, 1 ), \
+         CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_2D_optional_input"), \
+         _CUSTOM_WARP_AFFINE_OPTIONAL_KERNEL_SOURCE }
+
 #define PACK_RGB_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \
-        { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0, 1 ), \
+        { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0, 1, 0 ), \
          CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_rgb"), \
          _CUSTOM_WARP_AFFINE_RGB_KERNEL_SOURCE }

 #define PACK_RGB_2D_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \
-        { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1, 1 ), \
+        { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1, 1, 0 ), \
          CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_rgb_2D"), \
          _CUSTOM_WARP_AFFINE_RGB_KERNEL_SOURCE }

+#define PACK_OPTIONAL_INPUT_RGB_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \
+        { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0, 1, 1 ), \
+         CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_rgb_optional_input"), \
+         _CUSTOM_WARP_AFFINE_RGB_OPTIONAL_KERNEL_SOURCE }
+
+#define PACK_RGB_2D_OPTIONAL_INPUT_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \
+        { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1, 1, 1 ), \
+         CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_rgb_2D_optional_input"), \
+         _CUSTOM_WARP_AFFINE_RGB_OPTIONAL_KERNEL_SOURCE }
+
 typedef struct
 {
     uint32_t key;
@@ -84,15 +107,23 @@ static const _kernel_map_type _custom_warp_affine_kernel_map[] =
     // Register kernel here
     PACK_KERNEL_MAP( U8, U8, nearest_neighbor ),
     PACK_KERNEL_MAP( U8, U8, bilinear ),
+    PACK_OPTIONAL_INPUT_KERNEL_MAP( U8, U8, nearest_neighbor ),
+    PACK_OPTIONAL_INPUT_KERNEL_MAP( U8, U8, bilinear ),

     PACK_2D_KERNEL_MAP( U8, U8, nearest_neighbor ),
     PACK_2D_KERNEL_MAP( U8, U8, bilinear ),
+    PACK_OPTIONAL_INPUT_2D_KERNEL_MAP( U8, U8, nearest_neighbor ),
+    PACK_OPTIONAL_INPUT_2D_KERNEL_MAP( U8, U8, bilinear ),

     PACK_RGB_KERNEL_MAP( U8, U8, nearest_neighbor ),
     PACK_RGB_KERNEL_MAP( U8, U8, bilinear ),
+    PACK_OPTIONAL_INPUT_RGB_KERNEL_MAP( U8, U8, nearest_neighbor ),
+    PACK_OPTIONAL_INPUT_RGB_KERNEL_MAP( U8, U8, bilinear ),

     PACK_RGB_2D_KERNEL_MAP( U8, U8, nearest_neighbor ),
     PACK_RGB_2D_KERNEL_MAP( U8, U8, bilinear ),
+    PACK_RGB_2D_OPTIONAL_INPUT_KERNEL_MAP( U8, U8, nearest_neighbor ),
+    PACK_RGB_2D_OPTIONAL_INPUT_KERNEL_MAP( U8, U8, bilinear ),
 };

 /*
@@ -110,8 +141,21 @@ static vx_param_description_t _custom_warp_affine_kernel_param_def[] =
     {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
     // Add kererl parameters here
 };
-#define _CUSTOM_WARP_AFFINE_PARAM_NUM  _cnt_of_array( _custom_warp_affine_kernel_param_def )
-#define SCALAR_MATRIX_OFFSET (2)
+
+static vx_param_description_t _custom_warp_affine_optinal_kernel_param_def[] =
+{
+    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    // Add kererl parameters here
+};
+#define _CUSTOM_WARP_AFFINE_PARAM_NUM  _cnt_of_array( _custom_warp_affine_optinal_kernel_param_def )

 /*
  * Kernel initializer
  */
@@ -138,17 +182,21 @@ DEF_KERNEL_INITIALIZER(_custom_warp_affine_initializer)
     float matrix1[4] = {0};
     float matrix4[4] = {0};
     int32_t i = 0;
+    uint32_t scalar_matrix_offset = 3;

-    VSI_UNREFERENCED(param_size);

     attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
     CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
-    attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
+    attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[param_size - 7] );
     CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );

+    if (param_size == 8)
+    {
+        scalar_matrix_offset = 2;
+    }
+
     for (i = 0; i < 6; i++)
     {
-        status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MATRIX_OFFSET + i],
+        status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[scalar_matrix_offset + i],
             &m[i]);
         CHECK_STATUS_FAIL_GOTO(status, final );
     }
@@ -170,6 +218,8 @@ DEF_KERNEL_INITIALIZER(_custom_warp_affine_initializer)
             / gpu_param.global_scale[1]);
     gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;

+    if (param_size == 8)
+    {
     status = vsi_nn_kernel_gpu_add_param( node,
             "matrix0", &matrix0 );
     status |= vsi_nn_kernel_gpu_add_param( node,
             "matrix1", &matrix1 );
@@ -177,6 +227,7 @@ DEF_KERNEL_INITIALIZER(_custom_warp_affine_initializer)
     status |= vsi_nn_kernel_gpu_add_param( node,
             "matrix4", &matrix4 );
     CHECK_STATUS_FAIL_GOTO(status, final );
+    }

     status = vsi_nn_kernel_gpu_config( node, &gpu_param );
@@ -217,17 +268,21 @@ DEF_KERNEL_INITIALIZER(_custom_warp_affine_rgb_initializer)
     float matrix0[4] = {0};
     float matrix1[4] = {0};
     int32_t i = 0;
+    uint32_t scalar_matrix_offset = 3;

-    VSI_UNREFERENCED(param_size);

     attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
     CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
-    attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
+    attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[param_size - 7] );
    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );

+    if (param_size == 8)
+    {
+        scalar_matrix_offset = 2;
+    }
+
     for (i = 0; i < 6; i++)
     {
-        status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MATRIX_OFFSET + i],
+        status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[scalar_matrix_offset + i],
             &m[i]);
         CHECK_STATUS_FAIL_GOTO(status, final );
     }
@@ -248,11 +303,14 @@ DEF_KERNEL_INITIALIZER(_custom_warp_affine_rgb_initializer)
             / gpu_param.global_scale[1]);
     gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;

+    if (param_size == 8)
+    {
     status = vsi_nn_kernel_gpu_add_param( node,
             "matrix0", &matrix0 );
     status |= vsi_nn_kernel_gpu_add_param( node,
             "matrix1", &matrix1 );
     CHECK_STATUS_FAIL_GOTO(status, final );
+    }

     status = vsi_nn_kernel_gpu_config( node, &gpu_param );
@@ -280,7 +338,8 @@ static vsi_status _query_kernel
     vsi_nn_tensor_t * const * const inputs,
     vsi_nn_tensor_t * const * const outputs,
     int32_t type,
-    int32_t rgb_type
+    int32_t rgb_type,
+    int32_t optional_input
     )
 {
     vsi_status status = VSI_FAILURE;
@@ -289,6 +348,7 @@ static vsi_status _query_kernel
     const _kernel_map_type * kernel_map = _custom_warp_affine_kernel_map;
     size_t kernel_map_size = _cnt_of_array( _custom_warp_affine_kernel_map );
     vx_param_description_t * param_def = _custom_warp_affine_kernel_param_def;
+    size_t param_def_size = _cnt_of_array( _custom_warp_affine_kernel_param_def );
     vx_kernel_initialize_f initializer = _custom_warp_affine_initializer;
     int32_t is_2d_img = inputs[0]->attr.dim_num < 3 || inputs[0]->attr.size[2] == 1;
     uint32_t key = 0;
@@ -297,7 +357,12 @@ static vsi_status _query_kernel
     in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
     out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

-    key = CUSTOM_WARP_AFFINE_HASH_KEY( in_dtype, out_dtype, type, is_2d_img, rgb_type );
+    if (optional_input == 1)
+    {
+        param_def = _custom_warp_affine_optinal_kernel_param_def;
+        param_def_size = _cnt_of_array(_custom_warp_affine_optinal_kernel_param_def);
+    }
+
+    key = CUSTOM_WARP_AFFINE_HASH_KEY( in_dtype, out_dtype, type, is_2d_img, rgb_type, optional_input );

     if (rgb_type == 1)
     {
         initializer = _custom_warp_affine_rgb_initializer;
@@ -313,7 +378,7 @@ static vsi_status _query_kernel
     {
         snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
         kernel->info.parameters  = param_def;
-        kernel->info.numParams   = _cnt_of_array( _custom_warp_affine_kernel_param_def );
+        kernel->info.numParams   = (vx_uint32)param_def_size;
         kernel->info.initialize  = initializer;
         // Register code source
         vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
@@ -348,13 +413,23 @@ static vsi_nn_kernel_node_t _setup
     int32_t type = vsi_nn_kernel_param_get_int32( params, "type");
     int32_t rgb_type = vsi_nn_kernel_param_get_int32( params, "rgb_type");
     float * buffer = (float*)vsi_nn_kernel_param_get_const_buffer( params, "matrix", &buffer_size );
+    int32_t optional_input = 1;
+    uint32_t scalar_matrix_offset = 3;
+    uint32_t param_num = _CUSTOM_WARP_AFFINE_PARAM_NUM;
+
+    if (inputs[1] == NULL)
+    {
+        optional_input = 0;
+        input_num = 1;
+        scalar_matrix_offset = scalar_matrix_offset - 1;
+        param_num = param_num - 1;
+    }

     if (vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE)
     {
         return NULL;
     }

-    status = _query_kernel( kernel, inputs, outputs, type, rgb_type );
+    status = _query_kernel( kernel, inputs, outputs, type, rgb_type, optional_input );
     if ( VSI_SUCCESS == status)
     {
         node = vsi_nn_kernel_create_node( graph, kernel );
@@ -364,19 +439,20 @@ static vsi_nn_kernel_node_t _setup
             border.mode = VX_BORDER_CONSTANT;

             /* Set inputs and outputs */
-            vsi_nn_kernel_node_pack_io( node_params, _CUSTOM_WARP_AFFINE_PARAM_NUM,
+            vsi_nn_kernel_node_pack_io( node_params, param_num,
                     inputs, input_num, outputs, output_num );
             for (i = 0; i < buffer_size; i++)
             {
-                node_params[SCALAR_MATRIX_OFFSET + i] = vsi_nn_kernel_scalar_create(
+                node_params[scalar_matrix_offset + i] = vsi_nn_kernel_scalar_create(
                         graph, F32, &buffer[i] );
             }
             /* Pass parameters to node. */
-            status = vsi_nn_kernel_node_pass_param( node, node_params, _CUSTOM_WARP_AFFINE_PARAM_NUM );
+            status = vsi_nn_kernel_node_pass_param( node, node_params, param_num );
             for (i = 0; i < buffer_size; i++)
             {
-                vsi_nn_kernel_scalar_release( &node_params[SCALAR_MATRIX_OFFSET + i] );
+                vsi_nn_kernel_scalar_release( &node_params[scalar_matrix_offset + i] );
             }
             // Set default border mode.
             border.constant_value.U32 = 0x00000000;
             status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) );
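
Reviewer note: a tiny sketch of how the widened hash key packs its six fields (the numeric dtype and type codes below are made-up placeholders, not the real vsi_nn_kernel_dtype_e values):

    #include <stdint.h>
    #include <stdio.h>

    /* Same shifts as CUSTOM_WARP_AFFINE_HASH_KEY above: dtypes in bits 0-15,
     * interp type at 16, 2D flag at 20, rgb flag at 24, optional input at 28. */
    static uint32_t warp_affine_key(uint32_t in_dt, uint32_t out_dt, uint32_t type,
                                    uint32_t img_2d, uint32_t rgb, uint32_t optional)
    {
        return in_dt | (out_dt << 8) | (type << 16) | (img_2d << 20) |
               (rgb << 24) | (optional << 28);
    }

    int main(void)
    {
        /* Placeholder codes 2 (U8) and 1 (bilinear): a 2D, non-RGB kernel
         * with the optional matrix tensor present. Prints 0x10110202. */
        printf("0x%08X\n", warp_affine_key(2, 2, 1, 1, 0, 1));
        return 0;
    }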

View File

@@ -42,7 +42,7 @@ typedef struct _custom_warp_affine_local_data_t {
 /*
     Declare number of input and output.
 */
-#define _INPUT_NUM          (1)
+#define _INPUT_NUM          (2)
 #define _OUTPUT_NUM         (1)

 static vsi_status op_compute
@@ -63,7 +63,7 @@ static vsi_status op_compute
     self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
             "custom_warp_affine",
-            inputs, 1,
+            inputs, 2,
             outputs, 1, param );

     vsi_nn_kernel_param_release( &param );

View File

@@ -269,7 +269,7 @@ static vsi_nn_kernel_node_t _setup
     ret = vsi_nn_kernel_optimize_element_shape(
             inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank);
-    if ( ret )
+    if ( !ret )
     {
         return NULL;
     }
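
Reviewer note: the helper reports success through its return flag, so the early return NULL now fires when shape optimization fails rather than when it succeeds; the old condition had it backwards.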

View File

@@ -297,21 +297,13 @@ static vsi_nn_kernel_node_t _setup
     if (axis < 0)
     {
-        axis_new = 0;
-        shapes[0][0] = 1;
-        shapes[0][1] = 1;
-        for (i = 0; i < inputs[0]->attr.dim_num; i++)
-        {
-            shapes[0][0] *= inputs[0]->attr.size[i];
-        }
-        rs_dim = 2;
-    }
-    else
-    {
-        vsi_nn_kernel_optimize_softmax_shape(
-            inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
-            shapes[0], &rs_dim, &axis_new);
+        axis += (int32_t)inputs[0]->attr.dim_num;
     }

+    vsi_nn_kernel_optimize_softmax_shape(
+        inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
+        shapes[0], &rs_dim, &axis_new);
+
     if (rs_dim > 3)
     {
         return NULL;

View File

@@ -327,19 +327,40 @@ static vsi_nn_kernel_node_t _setup
     vsi_nn_kernel_node_t node = NULL;
     vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}};
     vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
+    int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
     int32_t batch_dims = vsi_nn_kernel_param_get_int32( params, "batch_dims" );
-    int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
-    int32_t block_num = vsi_nn_kernel_param_get_int32( params, "block_num" );
-    int32_t axis_num = vsi_nn_kernel_param_get_int32( params, "axis_num" );
-    int32_t indices_num = vsi_nn_kernel_param_get_int32( params, "indices_num" );
+    int32_t block_size = 1;
+    int32_t block_num = 1;
+    int32_t axis_num = 0;
+    int32_t indices_num = 1;
     int32_t is_batch = batch_dims > 0 ? 1 : 0;
     vsi_size_t rs_dim = batch_dims == 0 ? 2 : 3;
-    int32_t is_array = block_size >= GPU_TENSOR_MAX_WIDTH ? 1 : 0;
-    int32_t i = 0;
+    int32_t is_array = 0;
+    uint32_t i = 0;
+    vsi_size_t *input_size = inputs[0]->attr.size;
+    uint32_t r_rank = vsi_nn_GetTensorIsScalar(inputs[0]) ? 0 : inputs[0]->attr.dim_num;
+    uint32_t q_rank = vsi_nn_GetTensorIsScalar(inputs[1]) ? 0 : inputs[1]->attr.dim_num;

     VSI_UNREFERENCED(input_num);
     VSI_UNREFERENCED(output_num);

+    for (i = 0; i < (uint32_t)axis; ++i)
+    {
+        block_size *= (int32_t)input_size[i];
+    }
+
+    axis_num = (int32_t)input_size[axis];
+
+    for (i = axis + 1; i < r_rank - batch_dims; ++i)
+    {
+        block_num *= (int32_t)input_size[i];
+    }
+
+    for (i = 0; i < q_rank - batch_dims; ++i)
+    {
+        indices_num *= (int32_t)inputs[1]->attr.size[i];
+    }
+
+    is_array = block_size >= GPU_TENSOR_MAX_WIDTH ? 1 : 0;
+
     status = cal_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, batch_dims, 0, &is_array);
     status |= cal_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, batch_dims, 1, &is_array);
     status |= cal_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, batch_dims, 0, &is_array);
View File

@@ -60,8 +60,13 @@ __BEGIN_DECLS
         HASH_LOG_SOFTMAX_SH_KERNEL_NAME(AXIS, F32, F32), \
         VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) },

+#define TENSOR_LOG_SOFTMAX_BFLOAT(AXIS, SRC0_TYPE, OUT_TYPE) \
+    { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0), \
+        HASH_LOG_SOFTMAX_SH_KERNEL_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \
+        VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) },
+
 #define HASH_LOG_SOFTMAX_SH_KERNEL_2D_NAME(AXIS, SRC0_TYPE, DST_TYPE) \
-    CVIVANTE_NAMESPACE("log_softmax_axis"#AXIS"_"#SRC0_TYPE"to"#DST_TYPE"_2D")
+    CVIVANTE_NAMESPACE("cl.log_softmax_axis"#AXIS"_"#SRC0_TYPE"to"#DST_TYPE"_2D")

 #define TENSOR_LOG_SOFTMAX_KERNELS_2D(AXIS, SRC0_TYPE, OUT_TYPE) \
     { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1), \
@@ -73,6 +78,11 @@ __BEGIN_DECLS
         HASH_LOG_SOFTMAX_SH_KERNEL_2D_NAME(AXIS, F32, F32), \
         VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) },

+#define TENSOR_LOG_SOFTMAX_BFLOAT_2D(AXIS, SRC0_TYPE, OUT_TYPE) \
+    { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1), \
+        HASH_LOG_SOFTMAX_SH_KERNEL_2D_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \
+        VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) },
+
 static const struct {
     uint32_t key;
     char* function_name;
@@ -85,11 +95,16 @@ static const struct {
     TENSOR_LOG_SOFTMAX_FLOAT(0, F16, F16)
     TENSOR_LOG_SOFTMAX_FLOAT(1, F16, F16)
     TENSOR_LOG_SOFTMAX_FLOAT(2, F16, F16)
+    TENSOR_LOG_SOFTMAX_BFLOAT(0, BF16, BF16)
+    TENSOR_LOG_SOFTMAX_BFLOAT(1, BF16, BF16)
+    TENSOR_LOG_SOFTMAX_BFLOAT(2, BF16, BF16)
     TENSOR_LOG_SOFTMAX_FLOAT_2D(0, F32, F32)
     TENSOR_LOG_SOFTMAX_FLOAT_2D(1, F32, F32)
     TENSOR_LOG_SOFTMAX_FLOAT_2D(0, F16, F16)
     TENSOR_LOG_SOFTMAX_FLOAT_2D(1, F16, F16)
+    TENSOR_LOG_SOFTMAX_BFLOAT_2D(0, BF16, BF16)
+    TENSOR_LOG_SOFTMAX_BFLOAT_2D(1, BF16, BF16)
     TENSOR_LOG_SOFTMAX_KERNELS(0, U8, U8)
     TENSOR_LOG_SOFTMAX_KERNELS(1, U8, U8)

View File

@@ -35,7 +35,6 @@
 #include "vsi_nn_tensor_util.h"
 #include "utils/vsi_nn_util.h"
 #include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"

 __BEGIN_DECLS

@@ -1572,8 +1571,8 @@ static vsi_nn_kernel_node_t _setup
     if (outputs[LSTMUNIT_ACT_OUTPUT] && VSI_NN_TYPE_UINT8 == outputs[LSTMUNIT_ACT_OUTPUT]->attr.dtype.vx_type)
     {
-        scale_val[8] = 1.0f / vsi_nn_get_tensor_scale(inputs[LSTMUNIT_ACT_OUTPUT]);
-        tail_val[8] = (float)vsi_nn_get_tensor_zero_point(inputs[LSTMUNIT_ACT_OUTPUT]);
+        scale_val[8] = 1.0f / vsi_nn_get_tensor_scale(outputs[LSTMUNIT_ACT_OUTPUT]);
+        tail_val[8] = (float)vsi_nn_get_tensor_zero_point(outputs[LSTMUNIT_ACT_OUTPUT]);
     }

     if( VSI_SUCCESS == status)

View File

@@ -35,6 +35,7 @@
 #include "utils/vsi_nn_util.h"
 #include "kernel/vsi_nn_kernel.h"
 #include "kernel/vsi_nn_kernel_eltwise.h"
+#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"

 __BEGIN_DECLS

@@ -44,6 +45,7 @@ __BEGIN_DECLS
 #define KERNEL_SOURCE_1 "matrixmul"
 #define KERNEL_SOURCE_2 "matrixmul_transA"
 #define KERNEL_SOURCE_3 "matrixmul_cross"
+#define KERNEL_SOURCE_4 "matrixmul_4x"

 typedef enum
 {
@@ -51,8 +53,9 @@ __BEGIN_DECLS
     _3D
 } vsi_nn_kernel_image_dim_type_e;

-#define HASH_MATRIXMUL_KEY(_type0, _type1, _type2, _image_dim, _trans_a, _cross) \
-    ((_type0 << 24) | (_type1 << 16) | (_type2 << 8) | (_image_dim << 4) | (_trans_a << 2) | (_cross))
+#define HASH_MATRIXMUL_KEY(_type0, _type1, _type2, _image_dim, flag_4x, _trans_a, _cross) \
+    ((_type0 << 24) | (_type1 << 16) | (_type2 << 8) | (_image_dim << 6) | \
+    (flag_4x << 4) | (_trans_a << 2) | (_cross))

 #define HASH_MATRIXMUL_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \
     CVIVANTE_NAMESPACE("cl.gemm_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM)
@@ -66,23 +69,39 @@ __BEGIN_DECLS
 #define HASH_MATRIXMUL_SH_KERNEL_MERGE_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \
     CVIVANTE_NAMESPACE("cl.gemm_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE"_merge")

+#define HASH_MATRIXMUL_4X_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \
+    CVIVANTE_NAMESPACE("cl.gemm_4x_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM)
+
+#define HASH_MATRIXMUL_4X_TRANSA_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \
+    CVIVANTE_NAMESPACE("cl.gemm_4x_transa_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM)
+
 #define TENSOR_MATRIXMUL_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \
-    { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 0), \
+    { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 0, 0), \
         HASH_MATRIXMUL_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \
         SOURCE },

+#define TENSOR_MATRIXMUL_4X_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \
+    {HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 1, 0, 0), \
+        HASH_MATRIXMUL_4X_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \
+        SOURCE },
+
+#define TENSOR_MATRIXMUL_4X_TRANSA_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \
+    {HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 1, 1, 0), \
+        HASH_MATRIXMUL_4X_TRANSA_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \
+        SOURCE },
+
 #define TENSOR_MATRIXMUL_TRANSA_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \
-    { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 1, 0), \
+    {HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 1, 0), \
        HASH_MATRIXMUL_TRANSA_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \
        SOURCE },

 #define TENSOR_MATRIXMUL_TRANSB_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \
-    { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 2, 0), \
+    {HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 2, 0), \
        HASH_MATRIXMUL_TRANSB_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \
        SOURCE },

 #define TENSOR_MATRIXMUL_MERGE_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \
-    { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 2), \
+    {HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 0, 2), \
        HASH_MATRIXMUL_SH_KERNEL_MERGE_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \
        SOURCE },

@@ -121,6 +140,8 @@ static const struct {
     TENSOR_MATRIXMUL_MERGE_KERNELS(U8, U8, U8, _3D, KERNEL_SOURCE_3)
     TENSOR_MATRIXMUL_MERGE_KERNELS(I8, I8, I8, _3D, KERNEL_SOURCE_3)
     TENSOR_MATRIXMUL_MERGE_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_3)
+    TENSOR_MATRIXMUL_4X_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_4)
+    TENSOR_MATRIXMUL_4X_TRANSA_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_4)
 };

 /*
@@ -252,12 +273,53 @@ final:
     return status;
 } /* _matrixmul_initializer() */

+DEF_KERNEL_INITIALIZER(_matrixmul_4x_initializer)
+    (vsi_nn_kernel_node_t node,
+    const vsi_nn_kernel_node_param_t* param,
+    size_t param_size) {
+    vsi_status status = VSI_FAILURE;
+    gpu_param_t gpu_param = {3, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}};
+    vsi_nn_kernel_tensor_attr_t* attr = NULL;
+    vsi_size_t width = 0;
+    vsi_size_t height = 0;
+
+    VSI_UNREFERENCED(param_size);
+
+    attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[2]);
+    CHECK_PTR_FAIL_GOTO(attr, "Create tensor attr buffer fail.", final);
+
+    width = attr->shape->data[0];
+    height = attr->shape->data[1];
+
+    gpu_param.dim = 2;
+    gpu_param.global_scale[0] = 4;
+    gpu_param.global_scale[1] = 1;
+    gpu_param.global_scale[2] = 1;
+    gpu_param.global_size[0] = (width + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0];
+    gpu_param.global_size[1] = height;
+    gpu_param.global_size[2] = 1;
+
+    status = vsi_nn_kernel_gpu_config(node, &gpu_param);
+    CHECK_STATUS_FAIL_GOTO(status, final);
+
+final:
+    if (attr) {
+        vsi_nn_kernel_tensor_attr_release(&attr);
+        attr = NULL;
+    }
+    return status;
+} /* _matrixmul_4x_initializer() */
+
 static vsi_status _query_kernel
     (
     vsi_nn_kernel_t * kernel,
     vsi_nn_tensor_t * const * const inputs,
     vsi_nn_tensor_t * const * const outputs,
     vsi_size_t depth,
+    int32_t flag_4x,
     int32_t transa,
     int32_t cross
     )
@ -317,7 +379,7 @@ static vsi_status _query_kernel
output_dtype = U8; output_dtype = U8;
} }
key = HASH_MATRIXMUL_KEY( input0_dtype, input1_dtype, output_dtype, dim_type, transa, cross ); key = HASH_MATRIXMUL_KEY( input0_dtype, input1_dtype, output_dtype, dim_type, flag_4x, transa, cross );
for( i = 0; i < _cnt_of_array(matrixmul_map); i ++ ) for( i = 0; i < _cnt_of_array(matrixmul_map); i ++ )
{ {
@ -340,7 +402,13 @@ static vsi_status _query_kernel
kernel->info.parameters = _matrixmul_merge_kernel_param_def; kernel->info.parameters = _matrixmul_merge_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _matrixmul_merge_kernel_param_def ); kernel->info.numParams = _cnt_of_array( _matrixmul_merge_kernel_param_def );
} }
if (flag_4x) {
kernel->info.initialize = _matrixmul_4x_initializer;
} else {
kernel->info.initialize = _matrixmul_initializer; kernel->info.initialize = _matrixmul_initializer;
}
// Register code source // Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper", "eltwise_ops_helper",
@ -352,6 +420,8 @@ static vsi_status _query_kernel
} }
return status; return status;
} /* _query_kernel() */ } /* _query_kernel() */
static vsi_nn_kernel_node_t _setup static vsi_nn_kernel_node_t _setup
( (
vsi_nn_graph_t * graph, vsi_nn_graph_t * graph,
@ -368,8 +438,8 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_t node = NULL; vsi_nn_kernel_node_t node = NULL;
int32_t transposeA = vsi_nn_kernel_param_get_int32( params, "transposeA" ); int32_t transposeA = vsi_nn_kernel_param_get_int32( params, "transposeA" );
int32_t transposeB = vsi_nn_kernel_param_get_int32( params, "transposeB" ); int32_t transposeB = vsi_nn_kernel_param_get_int32( params, "transposeB" );
int32_t cross_flg = vsi_nn_kernel_param_get_int32( params, "cross_flg" );
int32_t transFlg = 0; int32_t transFlg = 0;
int32_t flag_4x = 0;
vsi_size_t M = inputs[0]->attr.size[1]; vsi_size_t M = inputs[0]->attr.size[1];
vsi_size_t K = inputs[0]->attr.size[0]; vsi_size_t K = inputs[0]->attr.size[0];
vsi_size_t N = inputs[1]->attr.size[0]; vsi_size_t N = inputs[1]->attr.size[0];
@ -385,6 +455,22 @@ static vsi_nn_kernel_node_t _setup
float scale_out = vsi_nn_get_tensor_scale(outputs[0]); float scale_out = vsi_nn_get_tensor_scale(outputs[0]);
float zp_out = (float)vsi_nn_get_tensor_zero_point(outputs[0]); float zp_out = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
int32_t outer = 0; int32_t outer = 0;
vsi_size_t final_shape[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1};
uint32_t final_rank = 0;
vsi_nn_tensor_t* rs_in_tensors = NULL;
vsi_nn_tensor_t* rs_out_tensors = NULL;
vsi_nn_tensor_t* final_in_tensors[2] = {NULL};
vsi_nn_tensor_t* final_out_tensors[1] = {NULL};
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e input1_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8;
vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}};
uint32_t new_rank[3] = {0};
uint32_t cross_flg = 0;
uint32_t size_axis_in_out[3] = {0};
uint32_t stride_axis_in_out[9] = {0};
vsi_nn_tensor_t* tmp_inputs[2] = {NULL};
vsi_nn_tensor_t* tmp_outputs[1] = {NULL};
    VSI_UNREFERENCED(input_num);
    VSI_UNREFERENCED(output_num);
@ -397,6 +483,33 @@ static vsi_nn_kernel_node_t _setup
        return NULL;
    }
status = vsi_nn_kernel_optimize_matrixmul_broadcast_shape(
inputs[0]->attr.size,
inputs[1]->attr.size,
outputs[0]->attr.size,
inputs[0]->attr.dim_num,
inputs[1]->attr.dim_num,
outputs[0]->attr.dim_num,
shapes[0], shapes[1], shapes[2], new_rank,
&cross_flg, size_axis_in_out, stride_axis_in_out);
if (status)
{
tmp_inputs[0] = vsi_nn_reshape_tensor(graph, inputs[0], shapes[0], new_rank[0]);
tmp_inputs[1] = vsi_nn_reshape_tensor(graph, inputs[1], shapes[1], new_rank[1]);
tmp_outputs[0] = vsi_nn_reshape_tensor(graph, outputs[0], shapes[2], new_rank[2]);
M = tmp_inputs[0]->attr.size[1];
K = tmp_inputs[0]->attr.size[0];
N = tmp_inputs[1]->attr.size[0];
depth = tmp_outputs[0]->attr.dim_num > 2 ? tmp_outputs[0]->attr.size[2] : 1;
}
else
{
VSILOGE("illegal inputs shape");
status = VSI_FAILURE;
goto final;
}
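After the broadcast shapes are collapsed, each depth slice of the node computes a plain matrix product. As a host-side reference of those semantics only (an illustrative sketch, not ovxlib code):

#include <stddef.h>

/* C[m][n] = sum_k A[m][k] * B[k][n] for one depth slice, row-major. */
static void matmul_ref(const float* a, const float* b, float* c,
                       size_t m_dim, size_t k_dim, size_t n_dim)
{
    size_t m, n, k;
    for (m = 0; m < m_dim; m++)
    {
        for (n = 0; n < n_dim; n++)
        {
            float sum = 0.0f;
            for (k = 0; k < k_dim; k++)
            {
                sum += a[m * k_dim + k] * b[k * n_dim + n];
            }
            c[m * n_dim + n] = sum;
        }
    }
}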
    if (transposeB)
    {
        N = inputs[1]->attr.size[1];
@ -410,8 +523,8 @@ static vsi_nn_kernel_node_t _setup
        transFlg = 1;
    }

    a_depth = tmp_inputs[0]->attr.dim_num > 2 ? tmp_inputs[0]->attr.size[2] : 1;
    b_depth = tmp_inputs[1]->attr.dim_num > 2 ? tmp_inputs[1]->attr.size[2] : 1;
    if (b_depth == 1)
    {
@ -422,14 +535,14 @@ static vsi_nn_kernel_node_t _setup
        ac2zero = 1;
    }
    if (tmp_inputs[0]->attr.dim_num == 4 && tmp_inputs[1]->attr.dim_num == 3
        && a_depth > 1 && b_depth > 1 && cross_flg == 2)
    {
        ac2zero = 1;
        bc2zero = 0;
        outer = (int32_t)a_depth;
    }
    else if (tmp_inputs[1]->attr.dim_num == 4 && tmp_inputs[0]->attr.dim_num == 3
        && a_depth > 1 && b_depth > 1 && cross_flg == 2)
    {
        ac2zero = 0;
@ -437,7 +550,46 @@ static vsi_nn_kernel_node_t _setup
        outer = (int32_t)b_depth;
    }

    final_in_tensors[0] = tmp_inputs[0];
final_in_tensors[1] = tmp_inputs[1];
final_out_tensors[0] = tmp_outputs[0];
input0_dtype = vsi_nn_kernel_map_dtype(tmp_inputs[0]->attr.dtype.vx_type);
input1_dtype = vsi_nn_kernel_map_dtype(tmp_inputs[1]->attr.dtype.vx_type);
output_dtype = vsi_nn_kernel_map_dtype(tmp_outputs[0]->attr.dtype.vx_type);
if (((transFlg == 0) || (transFlg == 1)) && (cross_flg == 0) &&
(F32 == input0_dtype) && (F32 == input1_dtype) && (F32 == output_dtype))
{
vsi_size_t in1_w = tmp_inputs[1]->attr.size[0];
vsi_size_t in1_h = tmp_inputs[1]->attr.size[1];
vsi_size_t in1_c = tmp_inputs[1]->attr.dim_num > 2 ? tmp_inputs[1]->attr.size[2] : 1;
vsi_size_t in1_n = tmp_inputs[1]->attr.dim_num > 3 ? tmp_inputs[1]->attr.size[3] : 1;
vsi_size_t out_w = tmp_outputs[0]->attr.size[0];
vsi_size_t out_h = tmp_outputs[0]->attr.size[1];
vsi_size_t out_c = tmp_outputs[0]->attr.dim_num > 2 ? tmp_outputs[0]->attr.size[2] : 1;
vsi_size_t out_n = tmp_outputs[0]->attr.dim_num > 3 ? tmp_outputs[0]->attr.size[3] : 1;
if ((in1_w == 1) && (in1_h % 4 == 0) && (in1_c == 1) && (in1_n == 1) &&
(out_w == 1) && (out_h % 4 == 0) && (out_c == 1) && (out_n == 1))
{
final_shape[0] = in1_h;
final_shape[1] = in1_w;
final_rank = 2;
rs_in_tensors = vsi_nn_reshape_tensor(graph, tmp_inputs[1], final_shape, final_rank);
final_in_tensors[1] = rs_in_tensors;
final_shape[0] = out_h;
final_shape[1] = out_w;
final_rank = 2;
rs_out_tensors = vsi_nn_reshape_tensor(graph, tmp_outputs[0], final_shape, final_rank);
final_out_tensors[0] = rs_out_tensors;
flag_4x = 1;
}
}
status = _query_kernel(kernel, tmp_inputs, tmp_outputs, depth, flag_4x, transFlg, cross_flg);
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
@ -447,7 +599,7 @@ static vsi_nn_kernel_node_t _setup
            size_t param_num = cross_flg == 2 ? _MATRIXMUL_MERGE_PARAM_NUM : _MATRIXMUL_PARAM_NUM;
            /* Pass parameters to node. */
            vsi_nn_kernel_node_pack_io( node_params, param_num,
                final_in_tensors, 2, final_out_tensors, 1 );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &M );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &K );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &N );
@ -483,6 +635,14 @@ static vsi_nn_kernel_node_t _setup
            }
        }
    }
final:
vsi_safe_release_tensor(tmp_inputs[0]);
vsi_safe_release_tensor(tmp_inputs[1]);
vsi_safe_release_tensor(tmp_outputs[0]);
vsi_safe_release_tensor(rs_in_tensors);
vsi_safe_release_tensor(rs_out_tensors);
    return node;
} /* _setup() */

View File

@ -35,7 +35,8 @@
#include "vsi_nn_tensor_util.h" #include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h" #include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
#include "utils/vsi_nn_dtype_util.h"
__BEGIN_DECLS __BEGIN_DECLS
@ -114,6 +115,7 @@ static const _kernel_map_type moments_map[] =
    TENSOR_MOMENTS_TWO_AXIS_KERNELS(F32, F32, 0, 1, KERNEL_SOURCE_4)
    TENSOR_MOMENTS_TWO_AXIS_KERNELS(I32, F32, 0, 1, KERNEL_SOURCE_4)
    TENSOR_MOMENTS_TWO_AXIS_KERNELS(BF16,F32, 0, 1, KERNEL_SOURCE_4)
    TENSOR_MOMENTS_TWO_AXIS_KERNELS(U8, F32, 1, 2, KERNEL_SOURCE_4)
    TENSOR_MOMENTS_THREE_AXIS_KERNELS(U8, F32, 0, 1, 2, KERNEL_SOURCE_5)
    TENSOR_MOMENTS_THREE_AXIS_KERNELS(F32, F32, 0, 1, 2, KERNEL_SOURCE_5)
    TENSOR_MOMENTS_THREE_AXIS_KERNELS(I32, F32, 0, 1, 2, KERNEL_SOURCE_5)
@ -140,63 +142,6 @@ static vx_param_description_t _moments_kernel_param_def[] =
};
#define _MOMENTS_PARAM_NUM _cnt_of_array( _moments_kernel_param_def )
static int32_t set_constant_border
(
vsi_nn_kernel_node_t node,
int32_t value
)
{
vsi_status status = VSI_FAILURE;
vx_border_t border;
border.mode = VX_BORDER_CONSTANT;
border.constant_value.S32 = value;
border.constant_value.U32 = (vx_uint32)value;
border.constant_value.S16 = (vx_int16)value;
border.constant_value.U8 = (vx_uint8)value;
status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) );
return status;
}
static int32_t get_moments_output_reshape_size
(
vsi_nn_tensor_t ** outputs,
vsi_size_t sizes[VSI_NN_MAX_DIM_NUM],
int32_t* axis,
int32_t axis_num
)
{
uint32_t out_dims_num = outputs[0]->attr.dim_num;
vsi_size_t *output_size = outputs[0]->attr.size;
uint32_t i = 0;
int32_t out_rs_flg = 0;
for(i = 0; i < VSI_NN_MAX_DIM_NUM; ++i)
{
sizes[i] = 1;
}
sizes[3] = out_dims_num > 3 ? output_size[3] : 1;
if (axis_num == 1 && axis[0] == 0)
{
sizes[0] = output_size[1];
sizes[1] = out_dims_num > 2 ? output_size[2] : 1;
out_rs_flg = 1;
}
else if (axis_num == 1 && axis[0] == 1)
{
sizes[0] = output_size[0];
sizes[1] = out_dims_num > 2 ? output_size[2] : 1;
out_rs_flg = 1;
}
else if (axis_num == 2 && axis[0] == 0 && axis[1] == 1)
{
sizes[0] = out_dims_num > 2 ? output_size[2] : 1;
out_rs_flg = 1;
}
return out_rs_flg;
} /* _get_moments_tensor_reshape_size */
/*
 * Kernel initializer
 */
@ -247,26 +192,39 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
        gpu_param.global_size[0] = gpu_align_p2((height + gpu_param.global_scale[0] - 1)
            / gpu_param.global_scale[0], 4);
        gpu_param.global_size[1] = chn;
        gpu_param.global_size[2] = 1;
    }
    else if (axis_num == 1 && axis == 1)
    {
        gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1)
            / gpu_param.global_scale[0], 4);
        gpu_param.global_size[1] = chn;
        gpu_param.global_size[2] = 1;
    }
    else if (axis_num == 1 && axis == 2)
    {
        gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1)
            / gpu_param.global_scale[0], 4);
        gpu_param.global_size[1] = height;
        gpu_param.global_size[2] = 1;
    }
    else if (axis_num == 2 && axis == 0)
    {
        gpu_param.local_size[0] = 16;
        gpu_param.local_size[1] = 1;
        gpu_param.local_size[2] = 1;
        gpu_param.global_size[0] = 16;
        gpu_param.global_size[1] = chn;
        gpu_param.global_size[2] = 1;
    }
    else if (axis_num == 2 && axis == 1)
    {
        gpu_param.local_size[0] = 8;
        gpu_param.local_size[1] = 8;
        gpu_param.local_size[2] = 1;
        gpu_param.global_size[0] = 8;
        gpu_param.global_size[1] = 8;
        gpu_param.global_size[2] = width;
    }
    else if (axis_num == 3)
    {
@ -275,8 +233,8 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
        gpu_param.local_size[2] = 1;
        gpu_param.global_size[0] = 16;
        gpu_param.global_size[1] = 1;
        gpu_param.global_size[2] = 1;
    }

    status = vsi_nn_kernel_gpu_config( node, &gpu_param );
    CHECK_STATUS_FAIL_GOTO(status, final);
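With dim_ratio = 1 / (product of the reduced dimensions), the kernels compute mean = dim_ratio * sum(x) and variance = dim_ratio * sum(x*x) - mean*mean. A host-side sketch over one flattened reduction, for illustration only (not ovxlib code):

#include <stddef.h>

/* Population mean and variance of n values, matching the dim_ratio form. */
static void moments_ref(const float* x, size_t n, float* mean, float* vari)
{
    size_t i;
    float sum = 0.0f, sqr = 0.0f;
    for (i = 0; i < n; i++)
    {
        sum += x[i];
        sqr += x[i] * x[i];
    }
    *mean = sum / (float)n;
    *vari = sqr / (float)n - (*mean) * (*mean);
}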
@ -366,117 +324,78 @@ static vsi_nn_kernel_node_t _setup
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_MOMENTS_PARAM_NUM] = { NULL };
    vsi_nn_kernel_node_t node = NULL;
    size_t axis_num = 0;
    int32_t* axis = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "axis", &axis_num);
    int32_t first_axis = axis[0];
    uint32_t i = 0;
    vsi_nn_kernel_scalar_t scalar_list[INTERNAL_MOMENTS_SCALAR_NUM] = {NULL};
    uint32_t axis_size = 0;
    uint32_t rank_in = 0;
    uint32_t rank_out = 0;
    vsi_bool ret = FALSE;
    vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 1, 1, 1, 1 } };
    vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
    int32_t new_axis[VSI_NN_MAX_DIM_NUM] = {0};
    int32_t input_zp = vsi_nn_get_tensor_zero_point(inputs[0]);
    float input_scale = vsi_nn_get_tensor_scale(inputs[0]);
    float dim_ratio = 1;

    VSI_UNREFERENCED(input_num);
    VSI_UNREFERENCED(output_num);

    ret = vsi_nn_kernel_optimize_reduce_shape(
        inputs[0]->attr.size, inputs[0]->attr.dim_num,
        axis, (vsi_size_t)axis_num,
        outputs[0]->attr.size, outputs[0]->attr.dim_num,
        shapes[0], &rank_in, shapes[1], &rank_out,
        new_axis, &axis_size);
    if ( ret == FALSE || axis_size > 3 || (axis_size == 3 && new_axis[0] != 0))
    {
        return NULL;
    }

    reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
        inputs[0], shapes[0], rank_in );
    reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
        outputs[0], shapes[1], rank_out );
    reshape_tensors[2] = vsi_nn_reshape_tensor( graph,
        outputs[1], shapes[1], rank_out );

    first_axis = new_axis[0];
    for ( i = 0; i < axis_size; i++ )
    {
        dim_ratio = dim_ratio / (float)(shapes[0][new_axis[i]]);
    }

    if ( !vsi_nn_kernel_gpu_check_shape( shapes[0], rank_in) )
    {
        return NULL;
    }

    scalar_list[AXIS] = vsi_nn_kernel_scalar_create( graph, I32, &first_axis );
    scalar_list[AXIS_NUM] = vsi_nn_kernel_scalar_create( graph, I32, &axis_size );
    scalar_list[ZP] = vsi_nn_kernel_scalar_create( graph, I32, &input_zp );
    scalar_list[SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale );
    scalar_list[WIDTH] = vsi_nn_kernel_scalar_create( graph, I32, &shapes[0][0] );
    scalar_list[HEIGHT] = vsi_nn_kernel_scalar_create( graph, I32, &shapes[0][1] );
    scalar_list[CHN] = vsi_nn_kernel_scalar_create( graph, I32, &shapes[0][2] );
    scalar_list[DIMRATIO] = vsi_nn_kernel_scalar_create( graph, F32, &dim_ratio );

    status = _query_kernel( inputs, outputs, kernel, params, new_axis, axis_size, 0 );
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if ( node )
        {
            uint32_t index = 0;
            vx_border_t border;
            /* Pass parameters to node. */
            node_params[index++] = reshape_tensors[0]->t;
            node_params[index++] = reshape_tensors[1]->t;
            node_params[index++] = reshape_tensors[2]->t;
            node_params[index++] = scalar_list[AXIS];
            node_params[index++] = scalar_list[AXIS_NUM];
            node_params[index++] = scalar_list[ZP];
@ -487,29 +406,19 @@ static vsi_nn_kernel_node_t _setup
            node_params[index++] = scalar_list[DIMRATIO];
            status = vsi_nn_kernel_node_pass_param( node, node_params, _MOMENTS_PARAM_NUM );
            CHECK_STATUS(status);
            // Set default border mode.
            border.mode = VX_BORDER_CONSTANT;
            vsi_nn_Float32ToDtype(0, (uint8_t*)&border.constant_value.U32, &inputs[0]->attr.dtype);
            status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) );
            CHECK_STATUS(status);
        }
    }
    vsi_safe_release_tensor(reshape_tensors[0]);
    vsi_safe_release_tensor(reshape_tensors[1]);
    vsi_safe_release_tensor(reshape_tensors[2]);
    /* Pass parameters to node. */
    for( i = 0; i < INTERNAL_MOMENTS_SCALAR_NUM; i ++ )
    {

View File

@ -0,0 +1,318 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _MAXPOOL_KERNEL_SOURCE_NAME "maxpool"
typedef enum
{
_error = -1,
_MAX = 0,
_AVG
} vsi_nn_pool_type_e;
// Add kernel hashtable here
#define POOL_HASH_KEY( IN_DTYPE0, OUT_DTYPE, POOL_DTYPE ) \
(( IN_DTYPE0 << 16 ) | ( OUT_DTYPE << 8 ) | ( POOL_DTYPE ))
#define MAXPOOL_KERNELS( IN_DTYPE0, OUT_DTYPE ) \
{ POOL_HASH_KEY( IN_DTYPE0, OUT_DTYPE, _MAX ), \
CVIVANTE_NAMESPACE("cl.maxpool_"#IN_DTYPE0"to"#OUT_DTYPE), \
_MAXPOOL_KERNEL_SOURCE_NAME },
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type kernel_map[] =
{
// Register kernel here
MAXPOOL_KERNELS( I32, I32 )
MAXPOOL_KERNELS( U32, U32 )
MAXPOOL_KERNELS( F32, F32 )
MAXPOOL_KERNELS( U32, F32 )
MAXPOOL_KERNELS( F32, U32 )
};
/*
* Kernel params
*/
static vx_param_description_t _maxpool_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _MAXPOOL_PARAM_NUM _cnt_of_array( _maxpool_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_maxpool_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_status status = VSI_FAILURE;
vx_tensor output = (vx_tensor)param[1];
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t *output_shape = NULL;
VSI_UNREFERENCED(param_size);
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output );
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
output_shape = output_attr->shape;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = (output_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0];
gpu_param.global_size[1] = (output_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1];
gpu_param.global_size[2] = (output_shape->data[2] + gpu_param.global_scale[2] - 1)
/ gpu_param.global_scale[2];
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (output_attr)
{
vsi_nn_kernel_tensor_attr_release(&output_attr);
}
return status;
} /* _maxpool_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t pool_type
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype = U8;
vsi_nn_kernel_dtype_e out_dtype = U8;
uint32_t key = 0;
uint32_t i = 0;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (in_dtype == U8)
{
in_dtype = U32;
}
else if (in_dtype == F16)
{
in_dtype = F32;
}
else if (in_dtype == I8 || in_dtype == I16)
{
in_dtype = I32;
}
if (out_dtype == U8)
{
out_dtype = U32;
}
else if (out_dtype == F16)
{
out_dtype = F32;
}
else if (out_dtype == I8 || out_dtype == I16)
{
out_dtype = I32;
}
key = POOL_HASH_KEY( in_dtype, out_dtype, pool_type );
for ( i = 0; i < (uint32_t)_cnt_of_array(kernel_map); i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)_cnt_of_array(kernel_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = _maxpool_kernel_param_def;
kernel->info.numParams = (uint32_t)_cnt_of_array(_maxpool_kernel_param_def);
kernel->info.initialize = _maxpool_initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_MAXPOOL_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
int32_t pool_type = vsi_nn_kernel_param_get_int32( params, "pool_type" );
int32_t pool_size_x = vsi_nn_kernel_param_get_int32( params, "pool_size_x" );
int32_t pool_size_y = vsi_nn_kernel_param_get_int32( params, "pool_size_y" );
int32_t pool_pad_x_left = vsi_nn_kernel_param_get_int32( params, "pool_pad_x_left" );
int32_t pool_pad_y_top = vsi_nn_kernel_param_get_int32( params, "pool_pad_y_top" );
int32_t stride_x = vsi_nn_kernel_param_get_int32( params, "stride_x" );
int32_t stride_y = vsi_nn_kernel_param_get_int32( params, "stride_y" );
int32_t dilation_x = vsi_nn_kernel_param_get_int32( params, "dilation_x" );
int32_t dilation_y = vsi_nn_kernel_param_get_int32( params, "dilation_y" );
int32_t kernel_dia_x = pool_size_x * dilation_x;
int32_t kernel_dia_y = pool_size_y * dilation_y;
float output_scale = vsi_nn_get_tensor_scale(outputs[0]);
float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float input_scale = vsi_nn_get_tensor_scale(inputs[0]);
float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float inout_scale = input_scale / output_scale;
float inout_tail = output_zp - input_zp * inout_scale;
int32_t width = (int32_t)inputs[0]->attr.size[0];
int32_t height = (int32_t)inputs[0]->attr.size[1];
if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size,
inputs[0]->attr.dim_num )
|| !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ))
{
return NULL;
}
status = _query_kernel( kernel, inputs, outputs, pool_type );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
uint32_t index = 2;
vsi_nn_kernel_node_pack_io( node_params, _MAXPOOL_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_x );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_y );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pool_pad_x_left );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pool_pad_y_top );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &kernel_dia_x );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &kernel_dia_y );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &dilation_x );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &dilation_y );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inout_scale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inout_tail );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _MAXPOOL_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &node_params[2] );
vsi_nn_kernel_scalar_release( &node_params[3] );
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
vsi_nn_kernel_scalar_release( &node_params[6] );
vsi_nn_kernel_scalar_release( &node_params[7] );
vsi_nn_kernel_scalar_release( &node_params[8] );
vsi_nn_kernel_scalar_release( &node_params[9] );
vsi_nn_kernel_scalar_release( &node_params[10] );
vsi_nn_kernel_scalar_release( &node_params[11] );
vsi_nn_kernel_scalar_release( &node_params[12] );
vsi_nn_kernel_scalar_release( &node_params[13] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( pool, _setup )
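For reference, window element j of the dilated pooling above samples the input at out_x * stride - pad + j * dilation, which is why the setup passes kernel_dia_x/y alongside the raw dilation. A one-dimensional host-side sketch of those semantics (illustration only; the CL kernel source is the authority):

#include <float.h>

/* Max over a dilated window; out-of-range taps are skipped. */
static float maxpool_ref_1d(const float* in, int width, int out_x,
                            int stride, int pad, int ksize, int dilation)
{
    int j;
    float m = -FLT_MAX;
    for (j = 0; j < ksize; j++)
    {
        int ix = out_x * stride - pad + j * dilation;
        if (ix >= 0 && ix < width && in[ix] > m)
        {
            m = in[ix];
        }
    }
    return m;
}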

View File

@ -590,7 +590,7 @@ static vsi_nn_kernel_node_t _setup
    ret = vsi_nn_kernel_optimize_element_shape(
        inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank);
    if ( !ret )
    {
        return NULL;
    }

View File

@ -349,7 +349,7 @@ DEF_KERNEL_INITIALIZER(_comparisons_initializer)
            input1Scale = (float)((int64_t)1 << -fl);
        }
    }
    else if( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
    {
        input1Scale = attr[1]->asymm.scale;
        input1Tail = 0 - attr[1]->asymm.zero_point * input1Scale;

View File

@ -866,21 +866,13 @@ static vsi_nn_kernel_node_t _setup
    if (axis < 0)
    {
        axis += (int32_t)inputs[0]->attr.dim_num;
    }

    vsi_nn_kernel_optimize_softmax_shape(
        inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
        shapes[0], &rs_dim, &axis_new);

    if (rs_dim > 3)
    {
        return NULL;
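The negative-axis handling above follows the usual convention that axis -1 names the last dimension. A minimal hypothetical helper with the same behavior:

/* Map a possibly negative axis into [0, dim_num). */
static int normalize_axis(int axis, int dim_num)
{
    return axis < 0 ? axis + dim_num : axis;
}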

View File

@ -250,7 +250,8 @@ static vsi_status get_gather_tensor_reshape_size
    sizes[0] = block_size;
    sizes[1] = elementCnt / block_size;
    sizes[2] = outerCnt;
    if ((elementCnt / block_size) > VSI_NN_MAX_IMAGE_WIDTH ||
        block_size > VSI_NN_MAX_IMAGE_WIDTH)
    {
        arrayFlg[0] = 1;
    }
@ -490,6 +491,8 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer)
    float src0Scale = 1;
    int32_t dstZP = 0;
    float dstScale = 1;
    int32_t remainder = 0;
    int32_t width = 0;
    uint32_t pack_key = 0;
@ -546,6 +549,7 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer)
        indices_num *= (int32_t)(input1_shape->data[i]);
    }
    batch = (int32_t)(input1_shape->data[input_dims1 - 1]);
    width = (int32_t)(input1_shape->data[0]);

    shaderParam.global_scale[0] = 4;
    shaderParam.global_scale[1] = 1;
@ -562,6 +566,7 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer)
        (IN0_TYPE | (OUT_TYPE << 8))
    pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype);
    remainder = indices_num % 4;
    {
        uint16_t M0 = 0;
@ -656,6 +661,8 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer)
    {
        status |= vsi_nn_kernel_gpu_add_param(node, "batch", &batch);
    }
    status |= vsi_nn_kernel_gpu_add_param(node, "width", &width);
    status |= vsi_nn_kernel_gpu_add_param(node, "remainder", &remainder);
    CHECK_STATUS_FAIL_GOTO(status, OnError );
OnError:
@ -763,20 +770,36 @@ static vsi_nn_kernel_node_t _setup
    vsi_nn_kernel_node_t node = NULL;
    vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}};
    vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
    int32_t block_size = 1;
    int32_t block_num = 1;
    int32_t axis_num = 0;
    int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
    int32_t batch_dims = vsi_nn_kernel_param_get_int32( params, "batch_dims" );
    int32_t axis0_flg = 0;
    int32_t is_array = 0;
    int32_t is_batch = batch_dims > 0 ? 1 : 0;
    vsi_size_t rs_dim = batch_dims == 0 ? 2 : 3;
    vsi_size_t *input_size = inputs[0]->attr.size;
    uint32_t i = 0;
    uint32_t r_rank = vsi_nn_GetTensorIsScalar(inputs[0]) ? 0 : inputs[0]->attr.dim_num;

    VSI_UNREFERENCED(input_num);
    VSI_UNREFERENCED(output_num);

    for (i = 0; i < (uint32_t)axis; ++i)
    {
        block_size *= (int32_t)input_size[i];
    }
    axis_num = (int32_t)input_size[axis];
    for (i = axis + 1; i < r_rank - batch_dims; ++i)
    {
        block_num *= (int32_t)input_size[i];
    }
    is_array = block_size > VSI_NN_MAX_BLOCK_SIZE ? 1 : 0;

    if (axis == 0)
    {
        status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], axis_num, batch_dims, 0, &is_array);
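The loops above view the input as block_num outer blocks, each holding axis_num slices of block_size contiguous elements (size[0] is the innermost dimension). A host-side sketch of the gather semantics that decomposition implies (illustrative assumption, not ovxlib code):

#include <stddef.h>
#include <string.h>

/* out[b][j][:] = in[b][indices[j]][:] with block_size-element slices. */
static void gather_ref(const float* in, const int* indices, float* out,
                       int block_size, int axis_num, int block_num,
                       int idx_num)
{
    int b, j;
    for (b = 0; b < block_num; b++)
    {
        for (j = 0; j < idx_num; j++)
        {
            const float* src = in + ((size_t)b * axis_num + indices[j]) * (size_t)block_size;
            float* dst = out + ((size_t)b * idx_num + j) * (size_t)block_size;
            memcpy(dst, src, (size_t)block_size * sizeof(float));
        }
    }
}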

View File

@ -47,11 +47,10 @@ typedef enum
} _internal_kernel_e;

#define _GRUCELL_ACTIVATION_KERNEL_SOURCE "grucell_activation"
#define _CDNN_KERNEL_SOURCE0 "grucell_cdnn_activation"
#define _CDNN_KERNEL_SOURCE1 "grucell_cdnn_activation_u8"
#define _KERNEL_SOURCE2 "grucell_cdnn_activation_bf16"

typedef enum _batch_fisrt_layerout_e
{
@ -114,6 +113,11 @@ static const _kernel_map_type _grucell_activation_kernel_map[] =
    PACK_KERNEL_MAP( U8, U8, U8, U8, hsigmoid, VSI_NN_ACT_TANH, CN),
    PACK_KERNEL_MAP( F16, F16, F16, F16, hsigmoid, VSI_NN_ACT_TANH, CN),
    PACK_KERNEL_MAP( F16, F16, F16, U8, hsigmoid, VSI_NN_ACT_TANH, CN),
    PACK_KERNEL_MAP( BF16, BF16, BF16, BF16, sigmoid, VSI_NN_ACT_TANH, NC),
    PACK_KERNEL_MAP( BF16, BF16, BF16, BF16, sigmoid, VSI_NN_ACT_TANH, CN),
    PACK_KERNEL_MAP( BF16, BF16, BF16, BF16, hsigmoid, VSI_NN_ACT_TANH, NC),
    PACK_KERNEL_MAP( BF16, BF16, BF16, BF16, hsigmoid, VSI_NN_ACT_TANH, CN),
};

static const _kernel_map_type _grucell_cunn_sep_activation_kernel_map[] =
@ -130,6 +134,12 @@ static const _kernel_map_type _grucell_cunn_sep_activation_kernel_map[] =
    PACK_KERNEL_CDNN_SEP_MAP( U8, U8, U8, U8, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, CN, _CDNN_KERNEL_SOURCE1 ),
    PACK_KERNEL_CDNN_SEP_MAP( U8, U8, U8, U8, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, CN_FULL, _CDNN_KERNEL_SOURCE1 ),
    PACK_KERNEL_CDNN_SEP_MAP( BF16, BF16, BF16, BF16, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, NC, _KERNEL_SOURCE2 ),
    PACK_KERNEL_CDNN_SEP_MAP( BF16, BF16, BF16, BF16, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, CN, _KERNEL_SOURCE2 ),
    PACK_KERNEL_CDNN_SEP_MAP( BF16, BF16, BF16, BF16, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, CN_FULL, _KERNEL_SOURCE2 ),
};

static const _kernel_map_type _grucell_cunn_activation_kernel_map[] =
static const _kernel_map_type _grucell_cunn_activation_kernel_map[] = static const _kernel_map_type _grucell_cunn_activation_kernel_map[] =
@ -142,6 +152,10 @@ static const _kernel_map_type _grucell_cunn_activation_kernel_map[] =
    PACK_KERNEL_CDNN_MAP( U8, U8, U8, U8, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, NC, _CDNN_KERNEL_SOURCE1 ),
    PACK_KERNEL_CDNN_MAP( U8, U8, U8, U8, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, CN, _CDNN_KERNEL_SOURCE1 ),
    PACK_KERNEL_CDNN_MAP( BF16, BF16, BF16, BF16, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, NC, _KERNEL_SOURCE2 ),
    PACK_KERNEL_CDNN_MAP( BF16, BF16, BF16, BF16, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, CN, _KERNEL_SOURCE2 ),
};

/*
@ -322,6 +336,37 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_initializer)
"tensorScale", &tensorScale ); "tensorScale", &tensorScale );
CHECK_STATUS_FAIL_GOTO(status, final ); CHECK_STATUS_FAIL_GOTO(status, final );
} }
break;
case _PACK_SELECT_KEY(BF16, BF16, BF16, BF16):
{
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", &uniExtractOddData_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "tensorZP", &tensorZP );
status |= vsi_nn_kernel_gpu_add_param( node, "tensorScale", &tensorScale );
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
        default:
            break;
    }
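The uniConvBF16toF32_Part0_2x8 and uniExtractOddData_2x8 programs above rely on bfloat16 being the top 16 bits of an IEEE-754 float: widening is a left shift and narrowing keeps the high halves. A host-side equivalent for one value (illustration only; truncating, no rounding):

#include <stdint.h>
#include <string.h>

static float bf16_to_f32(uint16_t b)
{
    uint32_t u = (uint32_t)b << 16;   /* bf16 bits occupy the high half */
    float f;
    memcpy(&f, &u, sizeof f);
    return f;
}

static uint16_t f32_to_bf16(float f)
{
    uint32_t u;
    memcpy(&u, &f, sizeof u);
    return (uint16_t)(u >> 16);       /* keep the high 16 bits */
}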
@ -604,6 +649,34 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_cdnn_initializer)
            CHECK_STATUS_FAIL_GOTO(status, final );
        }
        break;
case _PACK_SELECT_KEY(BF16, BF16, BF16, BF16):
{
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", &uniExtractOddData_2x8);
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
        default:
            break;
    }

View File

@ -81,8 +81,10 @@ typedef struct
static const _kernel_map_type _grucell_activation_sma_kernel_map[] =
{
    PACK_KERNEL_MAP(F16, F16, F16, F16),
    PACK_KERNEL_MAP(BF16, BF16, BF16, BF16),
    PACK_KERNEL_MAP_2D(F16, F16, F16, F16),
    PACK_KERNEL_MAP_2D(BF16, BF16, BF16, BF16),
};

/*
@ -200,6 +202,45 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_sma_initializer)
            CHECK_STATUS_FAIL_GOTO(status, final );
        }
        break;
case _PACK_A_GRUCELL_ACTIVATION_SMA_KEY(BF16, BF16, BF16, BF16):
{
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x05050404, 0x07070606, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", &uniExtractOddData_2x8);
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
        default:
            break;
    }

View File

@ -72,10 +72,12 @@ static const _kernel_map_type _grucell_activation_z_h_kernel_map[] =
    PACK_KERNEL_MAP( I8, F16, I8, SIGMOID ),
    PACK_KERNEL_MAP( I16, F16, I16, SIGMOID ),
    PACK_KERNEL_MAP( F16, F16, F16, SIGMOID ),
    PACK_KERNEL_MAP( BF16, BF16, BF16, SIGMOID ),
    PACK_KERNEL_MAP( U8, F16, U8, HSIGMOID ),
    PACK_KERNEL_MAP( I8, F16, I8, HSIGMOID ),
    PACK_KERNEL_MAP( I16, F16, I16, HSIGMOID ),
    PACK_KERNEL_MAP( F16, F16, F16, HSIGMOID ),
    PACK_KERNEL_MAP( BF16, BF16, BF16, HSIGMOID ),
};

/*
@ -218,6 +220,34 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_z_h_initializer)
            CHECK_STATUS_FAIL_GOTO(status, final );
        }
        break;
case _PACK_SELECT_KEY(BF16, BF16, BF16):
{
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", &uniExtractOddData_2x8);
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
        case _PACK_SELECT_KEY(U8, F16, U8):
        case _PACK_SELECT_KEY(I8, F16, I8):
        case _PACK_SELECT_KEY(I16, F16, I16):

View File

@ -71,10 +71,12 @@ static const _kernel_map_type _grucell_h_times_activation_r_kernel_map[] =
    PACK_KERNEL_MAP( I8, F16, F16, SIGMOID ),
    PACK_KERNEL_MAP( I16, F16, F16, SIGMOID ),
    PACK_KERNEL_MAP( F16, F16, F16, SIGMOID ),
    PACK_KERNEL_MAP( BF16, BF16, BF16, SIGMOID ),
    PACK_KERNEL_MAP( U8, F16, F16, HSIGMOID ),
    PACK_KERNEL_MAP( I8, F16, F16, HSIGMOID ),
    PACK_KERNEL_MAP( I16, F16, F16, HSIGMOID ),
    PACK_KERNEL_MAP( F16, F16, F16, HSIGMOID ),
    PACK_KERNEL_MAP( BF16, BF16, BF16, HSIGMOID ),
};

/*
@ -194,6 +196,34 @@ DEF_KERNEL_INITIALIZER(_grucell_h_times_activation_r_initializer)
            CHECK_STATUS_FAIL_GOTO(status, final );
        }
        break;
case _PACK_SELECT_KEY(BF16, BF16, BF16):
{
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", &uniExtractOddData_2x8);
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
        case _PACK_SELECT_KEY(U8, F16, F16):
        case _PACK_SELECT_KEY(I8, F16, F16):
        case _PACK_SELECT_KEY(I16, F16, F16):

View File

@ -74,10 +74,12 @@ static const _kernel_map_type _grucell_reset_after_activation_kernel_map[] =
    PACK_KERNEL_MAP( I8, F16, I8, SIGMOID, TANH ),
    PACK_KERNEL_MAP( I16, F16, I16, SIGMOID, TANH ),
    PACK_KERNEL_MAP( F16, F16, F16, SIGMOID, TANH ),
    PACK_KERNEL_MAP( BF16, BF16, BF16, SIGMOID, TANH ),
    PACK_KERNEL_MAP( U8, F16, U8, SIGMOID, SIGMOID ),
    PACK_KERNEL_MAP( I8, F16, I8, SIGMOID, SIGMOID ),
    PACK_KERNEL_MAP( I16, F16, I16, SIGMOID, SIGMOID ),
    PACK_KERNEL_MAP( F16, F16, F16, SIGMOID, SIGMOID ),
    PACK_KERNEL_MAP( BF16, BF16, BF16, SIGMOID, SIGMOID ),
};
@ -224,6 +226,34 @@ DEF_KERNEL_INITIALIZER(_grucell_reset_after_activation_initializer)
            CHECK_STATUS_FAIL_GOTO(status, final );
        }
        break;
case _PACK_SELECT_KEY(BF16, BF16, BF16):
{
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", &uniExtractOddData_2x8);
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
        case _PACK_SELECT_KEY(U8, F16, U8):
        case _PACK_SELECT_KEY(I8, F16, I8):
        case _PACK_SELECT_KEY(I16, F16, I16):

View File

@ -439,6 +439,32 @@ static const _kernel_map_type _lstmunit_activation_kernel_map[] =
    GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, U8, I16, F16, HARD_SIGMOID, SP)
    GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, U8, I8, F16, HARD_SIGMOID, SP)
    GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, U8, U8, F16, HARD_SIGMOID, SP)
/* BF16 type */
GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 1, 0, 0, BF16, BF16, BF16, SIGMOID, CLP)
GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 1, 0, 0, BF16, BF16, BF16, SIGMOID, LP)
GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 0, 0, 0, BF16, BF16, BF16, SIGMOID, CL)
GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 0, 0, 0, BF16, BF16, BF16, SIGMOID, L)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, BF16, BF16, BF16, SIGMOID, BP)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, BF16, BF16, BF16, SIGMOID, B)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, BF16, BF16, BF16, SIGMOID, CBP)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, BF16, BF16, BF16, SIGMOID, CB)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, BF16, BF16, BF16, SIGMOID, SP)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, BF16, BF16, BF16, SIGMOID, S)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, BF16, BF16, BF16, SIGMOID, CSP)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, BF16, BF16, BF16, SIGMOID, CS)
GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 1, 0, 0, BF16, BF16, BF16, HARD_SIGMOID, CLP)
GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 1, 0, 0, BF16, BF16, BF16, HARD_SIGMOID, LP)
GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 0, 0, 0, BF16, BF16, BF16, HARD_SIGMOID, CL)
GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 0, 0, 0, BF16, BF16, BF16, HARD_SIGMOID, L)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, BF16, BF16, BF16, HARD_SIGMOID, BP)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, BF16, BF16, BF16, HARD_SIGMOID, B)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, BF16, BF16, BF16, HARD_SIGMOID, CBP)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, BF16, BF16, BF16, HARD_SIGMOID, CB)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, BF16, BF16, BF16, HARD_SIGMOID, SP)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, BF16, BF16, BF16, HARD_SIGMOID, S)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, BF16, BF16, BF16, HARD_SIGMOID, CSP)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, BF16, BF16, BF16, HARD_SIGMOID, CS)
};
@ -1135,6 +1161,26 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer)
        0x00000001, 0x00000000, 0x00000001, 0x00000000,
        0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
    }, GPU_DP_TYPE_16};
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
    if (dstQuantType == VSI_NN_KERNEL_QUANT_DFP)
    {
@ -1152,31 +1198,41 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer)
    if ( cellFormat == F16 )
    {
        status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractHalf4_4x4", &uniExtractHalf4_4x4);
    }
    if ( dstFormat == F16 )
    {
        status |= vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8);
    }
    else if ( dstFormat != BF16 )
    {
        status |= vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractInteger_2x8);
    }
    if ( cellFormat == BF16 && dstFormat == BF16)
    {
        status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8);
        status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", &uniExtractOddData_2x8);
    }
    else
    {
        status |= vsi_nn_kernel_gpu_add_param(node, "uniFp16toFp32_4x4", &uniFp16toFp32_4x4);
    }
    CHECK_STATUS_FAIL_GOTO(status, final );

    status = vsi_nn_kernel_gpu_add_param(node, "logE", &logE);
    status |= vsi_nn_kernel_gpu_add_param(node, "twoLogE", &twoLogE);
    status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &outputScale);
    status |= vsi_nn_kernel_gpu_add_param(node, "outputZP", &outputZP);
    status |= vsi_nn_kernel_gpu_add_param(node, "forget_bias", &forget_bias);
    status |= vsi_nn_kernel_gpu_add_param(node, "clip_Min_F", clip_Min_F);
    status |= vsi_nn_kernel_gpu_add_param(node, "clip_Max_F", clip_Max_F);

    if ( !_is_ln && input_attr[S_INPUT_FC_F]->dtype == F16 )
    {
        status |= vsi_nn_kernel_gpu_add_param(node, "uniFp16AddFp16toFp32_4x4", &uniFp16AddFp16toFp32_4x4);
    }
    CHECK_STATUS_FAIL_GOTO(status, final );

    if (input_attr[S_INPUT_FC_F]->dtype == U8 &&
        input_attr[S_INPUT_FC_F]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
@ -1380,8 +1436,8 @@ static vsi_status _query_kernel
    vx_param_description_t * param_def = NULL;
    size_t param_def_size = _LSTMUNIT_ACTIVATION_MAX_PARAM_NUM;
    vx_kernel_initialize_f initializer = _lstmunit_activation_initializer;
    uint32_t key = 0;
    uint32_t i = 0;

    set_vx_param_description_t( lstm_activation, &param_def );

View File

@ -36,6 +36,7 @@
#include "utils/vsi_nn_util.h" #include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h" #include "libnnext/vx_lib_nnext.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
__BEGIN_DECLS __BEGIN_DECLS
@ -1576,21 +1577,22 @@ static vsi_nn_kernel_node_t _setup
    vsi_nn_kernel_node_param_t tmp_params[_MATRIX_MUL_CROSS_PARAM_NUM] = { NULL };
    vsi_nn_kernel_node_t node = NULL;
    vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL;
    vsi_nn_tensor_t* tmp_inputs[2] = {NULL};
    vsi_nn_tensor_t* tmp_outputs[1] = {NULL};
    int32_t transposeA = vsi_nn_kernel_param_get_int32( params, "transposeA" );
    int32_t transposeB = vsi_nn_kernel_param_get_int32( params, "transposeB" );
    int32_t adjointA = vsi_nn_kernel_param_get_int32( params, "adjointA" );
    int32_t adjointB = vsi_nn_kernel_param_get_int32( params, "adjointB" );
    vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}};
    uint32_t new_rank[3] = {0};
    vsi_size_t M = inputs[0]->attr.size[1];
    vsi_size_t K = inputs[0]->attr.size[0];
    vsi_size_t N = inputs[1]->attr.size[0];
    vsi_size_t depthA = 1, depthB = 1;
    uint32_t cross_flg = 0;
    uint32_t size_axis_in_out[3] = {0};
    uint32_t stride_axis_in_out[9] = {0};

    VSI_UNREFERENCED(input_num);
    VSI_UNREFERENCED(output_num);
@ -1609,35 +1611,62 @@ static vsi_nn_kernel_node_t _setup
return NULL; return NULL;
} }
status = vsi_nn_kernel_optimize_matrixmul_broadcast_shape(
inputs[0]->attr.size,
inputs[1]->attr.size,
outputs[0]->attr.size,
inputs[0]->attr.dim_num,
inputs[1]->attr.dim_num,
outputs[0]->attr.dim_num,
shapes[0], shapes[1], shapes[2], new_rank,
&cross_flg, size_axis_in_out, stride_axis_in_out);
if (status)
{
tmp_inputs[0] = vsi_nn_reshape_tensor(graph, inputs[0], shapes[0], new_rank[0]);
tmp_inputs[1] = vsi_nn_reshape_tensor(graph, inputs[1], shapes[1], new_rank[1]);
tmp_outputs[0] = vsi_nn_reshape_tensor(graph, outputs[0], shapes[2], new_rank[2]);
M = tmp_inputs[0]->attr.size[1];
K = tmp_inputs[0]->attr.size[0];
N = tmp_inputs[1]->attr.size[0];
}
else
{
VSILOGE("illegal inputs shape");
status = VSI_FAILURE;
goto final;
}
if (transposeA) if (transposeA)
{ {
K = inputs[0]->attr.size[1]; K = tmp_inputs[0]->attr.size[1];
M = inputs[0]->attr.size[0]; M = tmp_inputs[0]->attr.size[0];
} }
else if (transposeB) else if (transposeB)
{ {
N = inputs[1]->attr.size[1]; N = tmp_inputs[1]->attr.size[1];
} }
depthA = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1; depthA = tmp_inputs[0]->attr.dim_num > 2 ? tmp_inputs[0]->attr.size[2] : 1;
depthB = inputs[1]->attr.dim_num > 2 ? inputs[1]->attr.size[2] : 1; depthB = tmp_inputs[1]->attr.dim_num > 2 ? tmp_inputs[1]->attr.size[2] : 1;
if (M == 1 && depthB == 1 && depthA > 1) if (M == 1 && depthB == 1 && depthA > 1)
{ {
vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0};
shape[0] = inputs[0]->attr.size[0]; shape[0] = tmp_inputs[0]->attr.size[0];
shape[1] = inputs[0]->attr.size[2]; shape[1] = tmp_inputs[0]->attr.size[2];
shape[2] = 1; shape[2] = 1;
shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; shape[3] = tmp_inputs[0]->attr.dim_num > 3 ? tmp_inputs[0]->attr.size[3] : 1;
rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 4 ); rs_input = vsi_nn_kernel_tensor_reshape( tmp_inputs[0]->t, shape, 4 );
shape[0] = outputs[0]->attr.size[0]; shape[0] = tmp_outputs[0]->attr.size[0];
shape[1] = outputs[0]->attr.size[2]; shape[1] = tmp_outputs[0]->attr.size[2];
shape[2] = 1; shape[2] = 1;
shape[3] = outputs[0]->attr.dim_num > 3 ? outputs[0]->attr.size[3] : 1; shape[3] = tmp_outputs[0]->attr.dim_num > 3 ? tmp_outputs[0]->attr.size[3] : 1;
rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 ); rs_output = vsi_nn_kernel_tensor_reshape( tmp_outputs[0]->t, shape, 4 );
} }
status = _query_kernel( inputs, outputs, kernel, transposeA, transposeB, cross_flg ); status = _query_kernel( tmp_inputs, tmp_outputs, kernel, transposeA, transposeB, cross_flg );
if ( VSI_SUCCESS == status) if ( VSI_SUCCESS == status)
{ {
node = vsi_nn_kernel_create_node( graph, kernel ); node = vsi_nn_kernel_create_node( graph, kernel );
@ -1649,13 +1678,13 @@ static vsi_nn_kernel_node_t _setup
if (rs_input) if (rs_input)
{ {
tmp_params[0] = rs_input; tmp_params[0] = rs_input;
tmp_params[1] = (vsi_nn_kernel_node_param_t)(inputs[1]->t); tmp_params[1] = (vsi_nn_kernel_node_param_t)(tmp_inputs[1]->t);
tmp_params[2] = rs_output; tmp_params[2] = rs_output;
} }
else else
{ {
vsi_nn_kernel_node_pack_io( tmp_params, param_num, vsi_nn_kernel_node_pack_io( tmp_params, param_num,
inputs, 2, outputs, 1 ); tmp_inputs, 2, tmp_outputs, 1 );
} }
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &transposeA ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &transposeA );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &transposeB ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &transposeB );
@ -1725,6 +1754,10 @@ static vsi_nn_kernel_node_t _setup
} }
} }
} }
final:
vsi_safe_release_tensor( tmp_inputs[0] );
vsi_safe_release_tensor( tmp_inputs[1] );
vsi_safe_release_tensor( tmp_outputs[0] );
if (rs_input) if (rs_input)
{ {
vsi_nn_kernel_tensor_release( &rs_input ); vsi_nn_kernel_tensor_release( &rs_input );
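The reshape-and-release pattern introduced in this hunk, sketched on its own under the assumption that vsi_nn_reshape_tensor() returns a caller-owned tensor view and that vsi_safe_release_tensor() tolerates NULL: every exit path funnels through final:, so each temporary is released exactly once.

    vsi_nn_tensor_t* tmp_inputs[2] = { NULL };
    tmp_inputs[0] = vsi_nn_reshape_tensor( graph, inputs[0], shapes[0], new_rank[0] );
    if ( NULL == tmp_inputs[0] )
    {
        goto final;                      /* nothing else allocated yet */
    }
    /* ... create the kernel node and pass parameters ... */
final:
    vsi_safe_release_tensor( tmp_inputs[0] );
    vsi_safe_release_tensor( tmp_inputs[1] );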

View File

@ -0,0 +1,374 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
__BEGIN_DECLS
#define KERNEL_SOURCE_0 "maxpool",
typedef enum
{
_error = -1,
_MAX = 0,
_AVG
} vsi_nn_pool_type_e;
#define HASH_POOL_KEY(_input_type, _output_type, _pool_type, _image_2d) \
((_input_type << 24) | (_output_type << 16) | (_pool_type << 8) | (_image_2d))
#define HASH_MAXPOOL_SH_KERNEL_NAME(SRC_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("evis.maxpool_"#SRC_TYPE"to"#DST_TYPE)
#define MAXPOOL_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_POOL_KEY(IN0_TYPE, OUT_TYPE, _MAX, 0), \
HASH_MAXPOOL_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \
SOURCE },
static const struct {
uint32_t key;
char* function_name;
const char* source_name;
} kernel_map[] =
{
MAXPOOL_KERNELS(F16, F16, KERNEL_SOURCE_0)
MAXPOOL_KERNELS(BF16, BF16, KERNEL_SOURCE_0)
MAXPOOL_KERNELS(I8, I8, KERNEL_SOURCE_0)
MAXPOOL_KERNELS(U8, U8, KERNEL_SOURCE_0)
MAXPOOL_KERNELS(I16, I16, KERNEL_SOURCE_0)
MAXPOOL_KERNELS(U8, F16, KERNEL_SOURCE_0)
MAXPOOL_KERNELS(I8, F16, KERNEL_SOURCE_0)
MAXPOOL_KERNELS(I16, F16, KERNEL_SOURCE_0)
MAXPOOL_KERNELS(F16, I8, KERNEL_SOURCE_0)
MAXPOOL_KERNELS(F16, U8, KERNEL_SOURCE_0)
MAXPOOL_KERNELS(F16, I16, KERNEL_SOURCE_0)
};
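Worked example of the lookup key built by HASH_POOL_KEY, with the dtype values shown symbolically: a U8-to-U8 max pool entry (image_2d = 0) hashes as

    key = (U8 << 24) | (U8 << 16) | (_MAX << 8) | 0;

and _query_kernel below scans kernel_map linearly until kernel_map[i].key matches exactly.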
static vx_param_description_t kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}
};
#define _EVIS_PARAM_NUM _cnt_of_array(kernel_param_def)
DEF_KERNEL_INITIALIZER(_maxpool_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
float input_zp = 0.0f;
float input_scale = 1.0f;
float output_zp = 0.0f;
float output_scale = 1.0f;
float inout_scale = 1.0f;
float inout_tail = 0.0f;
int32_t width = 0;
int32_t height = 0;
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
vsi_size_array_t * out_shape = NULL;
uint32_t pack_key = 0;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
out_shape = attr[1]->shape;
width = (int32_t)attr[0]->shape->data[0];
height = (int32_t)attr[0]->shape->data[1];
input_scale = attr[0]->scale;
input_zp = (float)attr[0]->zero_point;
output_scale = attr[1]->scale;
output_zp = (float)attr[1]->zero_point;
inout_scale = input_scale / output_scale;
inout_tail = output_zp - input_zp * inout_scale;
#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE ) \
(IN0_TYPE | ( OUT_TYPE << 16))
pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype );
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = (out_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0];
gpu_param.global_size[1] = (
(out_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1]);
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
{
gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvF16toFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00000000, 0x00003c00, 0x00000000,
0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
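Layout of a gpu_dp_inst_t literal as used above, read off the inline comments (an editor's summary of the EVIS dot-product encoding, not a spec quote): word 0 is TCfg (per-lane config), word 1 is ASelt (source-A select), words 2-3 are ABin (source-A lane bindings), word 4 is BSelt, words 5-6 are BBin, word 7 packs AccumType, ConstantType and PostShift, and words 8-15 hold per-lane constants (for instance 0x3c00 is fp16 1.0 in uniConvF16toFp32_4x4).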
status = vsi_nn_kernel_gpu_add_param( node, "inout_scale", &inout_scale );
status |= vsi_nn_kernel_gpu_add_param( node, "inout_tail", &inout_tail );
status |= vsi_nn_kernel_gpu_add_param( node, "width", &width );
status |= vsi_nn_kernel_gpu_add_param( node, "height", &height );
CHECK_STATUS_FAIL_GOTO(status, final);
switch( pack_key )
{
case _PACK_SELECT_KEY( I8, I8 ):
case _PACK_SELECT_KEY( U8, U8 ):
case _PACK_SELECT_KEY( I16, I16 ):
{
status = vsi_nn_kernel_gpu_add_param( node,
"uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 );
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
case _PACK_SELECT_KEY( F16, I8 ):
case _PACK_SELECT_KEY( F16, U8 ):
case _PACK_SELECT_KEY( F16, I16 ):
{
status = vsi_nn_kernel_gpu_add_param( node,
"uniConvF16toFp32_4x4", &uniConvF16toFp32_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 );
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
case _PACK_SELECT_KEY( BF16, BF16 ):
{
status = vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtractOddData_2x8", &uniExtractOddData_2x8 );
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
default:
break;
}
}
#undef _PACK_SELECT_KEY
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;
} /* _maxpool_initializer() */
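The global work size arithmetic above is plain ceiling division; isolated as a C helper for reference:

    static vsi_size_t ceil_div( vsi_size_t extent, vsi_size_t scale )
    {
        /* covers every output element even when extent % scale != 0 */
        return ( extent + scale - 1 ) / scale;
    }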
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
int32_t pool_type,
vsi_nn_kernel_t* kernel
)
{
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8;
vsi_status status = VSI_FAILURE;
uint32_t key = 0;
size_t i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = HASH_POOL_KEY( input0_dtype, output_dtype, pool_type, 0 );
for ( i = 0; i < _cnt_of_array(kernel_map); i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(kernel_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = kernel_param_def;
kernel->info.numParams = _cnt_of_array( kernel_param_def );
kernel->info.initialize = _maxpool_initializer;
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
kernel_map[i].source_name );
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t tmp_params[_EVIS_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t pool_type = vsi_nn_kernel_param_get_int32( params, "pool_type" );
int32_t pool_size_x = vsi_nn_kernel_param_get_int32( params, "pool_size_x" );
int32_t pool_size_y = vsi_nn_kernel_param_get_int32( params, "pool_size_y" );
int32_t pool_pad_x_left = vsi_nn_kernel_param_get_int32( params, "pool_pad_x_left" );
int32_t pool_pad_y_top = vsi_nn_kernel_param_get_int32( params, "pool_pad_y_top" );
int32_t stride_x = vsi_nn_kernel_param_get_int32( params, "stride_x" );
int32_t stride_y = vsi_nn_kernel_param_get_int32( params, "stride_y" );
int32_t dilation_x = vsi_nn_kernel_param_get_int32( params, "dilation_x" );
int32_t dilation_y = vsi_nn_kernel_param_get_int32( params, "dilation_y" );
int32_t kernel_dia_x = pool_size_x * dilation_x;
int32_t kernel_dia_y = pool_size_y * dilation_y;
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( inputs, outputs, pool_type, kernel );
if ( VSI_SUCCESS == status )
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
uint32_t index = 2;
/* Pass parameters to node. */
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PARAM_NUM,
inputs, 1, outputs, 1 );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_x );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_y );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pool_pad_x_left );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pool_pad_y_top );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &kernel_dia_x );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &kernel_dia_y );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &dilation_x );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &dilation_y );
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &tmp_params[2] );
vsi_nn_kernel_scalar_release( &tmp_params[3] );
vsi_nn_kernel_scalar_release( &tmp_params[4] );
vsi_nn_kernel_scalar_release( &tmp_params[5] );
vsi_nn_kernel_scalar_release( &tmp_params[6] );
vsi_nn_kernel_scalar_release( &tmp_params[7] );
vsi_nn_kernel_scalar_release( &tmp_params[8] );
vsi_nn_kernel_scalar_release( &tmp_params[9] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( pool, _setup )
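A plain C model of one dilated 1-D max-pool window, under the reading that the shader steps through the window in strides of dilation, so the kernel_dia_* = pool_size * dilation values above act as exclusive bounds that yield exactly pool_size taps; the function below is illustrative, not library API.

    #include <float.h>

    static float maxpool_1d_dilated( const float* in, int width,
                                     int x_start, int pool_size, int dilation )
    {
        float best = -FLT_MAX;
        int k;
        for ( k = 0; k < pool_size; k++ )
        {
            int x = x_start + k * dilation;   /* taps spaced `dilation` apart */
            if ( x >= 0 && x < width && in[x] > best )
            {
                best = in[x];
            }
        }
        return best;
    }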

View File

@ -117,6 +117,17 @@ static vx_param_description_t vxPreProcessNv12Kernel_param_def[] =
}; };
#define _EVIS_PRE_PROCESS_NV12_PARAM_NUM _cnt_of_array(vxPreProcessNv12Kernel_param_def) #define _EVIS_PRE_PROCESS_NV12_PARAM_NUM _cnt_of_array(vxPreProcessNv12Kernel_param_def)
static vsi_bool _check_nv12_type_from_env()
{
vsi_bool ret = FALSE;
char* env_s = vsi_nn_getenv("VSI_NN_ENABLE_OCV_NV12");
if (env_s)
{
ret = TRUE;
}
return ret;
}
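Usage note: the check above only tests that vsi_nn_getenv() returned non-NULL, so setting the variable to any value enables the alternate coefficients. From C on a POSIX system, for example (hypothetical call site; it must run before the kernel initializer executes):

    #include <stdlib.h>

    setenv( "VSI_NN_ENABLE_OCV_NV12", "1", 1 );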
DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer)
( (
vsi_nn_kernel_node_t node, vsi_nn_kernel_node_t node,
@ -145,6 +156,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer)
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_size_array_t * out_shape = NULL; vsi_size_array_t * out_shape = NULL;
vsi_bool ocv_nv12 = _check_nv12_type_from_env();
VSI_UNREFERENCED(param_size); VSI_UNREFERENCED(param_size);
@ -208,7 +220,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer)
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 }; }, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertNV12toB_4x4 = {{ gpu_dp_inst_t uniConvertNV12toB_4x4 = {{
0x05050505, // TCfg 0x05050505, // TCfg
0x04040404, // ASelt 0x04040404, // ASelt
@ -239,6 +250,17 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer)
0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000,
0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000 // Constant 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16 }; }, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractYtoShortSub16_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractUVtoCharSub128_2x8 = {{ gpu_dp_inst_t uniExtractUVtoCharSub128_2x8 = {{
0x99999999, // TCfg 0x99999999, // TCfg
0x44444444, // ASelt 0x44444444, // ASelt
@ -259,6 +281,61 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer)
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 }; }, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertUchartoFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000,
0x00030002, // ABin
0x02020202, // BSelt
0x00000000,
0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
if (ocv_nv12)
{
uniConvertNV12toB_4x4.data[2] = 0x00010000;
uniConvertNV12toB_4x4.data[3] = 0x00230022;
uniConvertNV12toB_4x4.data[8] = 0x40093ca7;
uniConvertNV12toB_4x4.data[10] = 0x40093ca7;
uniConvertNV12toB_4x4.data[12] = 0x40093ca7;
uniConvertNV12toB_4x4.data[14] = 0x40093ca7;
uniConvertNV12toG_4x4.data[2] = 0x01010100;
uniConvertNV12toG_4x4.data[3] = 0x03230322;
uniConvertNV12toG_4x4.data[8] = 0x36413ca7;
uniConvertNV12toG_4x4.data[9] = 0x00003a81;
uniConvertNV12toG_4x4.data[10] = 0x36413ca7;
uniConvertNV12toG_4x4.data[11] = 0x00003a81;
uniConvertNV12toG_4x4.data[12] = 0x36413ca7;
uniConvertNV12toG_4x4.data[13] = 0x00003a81;
uniConvertNV12toG_4x4.data[14] = 0x36413ca7;
uniConvertNV12toG_4x4.data[15] = 0x00003a81;
uniConvertNV12toR_4x4.data[2] = 0x00110010;
uniConvertNV12toR_4x4.data[3] = 0x00330032;
uniConvertNV12toR_4x4.data[8] = 0x3e623ca7;
uniConvertNV12toR_4x4.data[10] = 0x3e623ca7;
uniConvertNV12toR_4x4.data[12] = 0x3e623ca7;
uniConvertNV12toR_4x4.data[14] = 0x3e623ca7;
uniExtractUVtoCharSub128_2x8.data[2] = 0x03020100;
uniExtractUVtoCharSub128_2x8.data[3] = 0x07060504;
uniExtractYtoShortSub16_2x8.data[0] = 0x99999999;
uniExtractYtoShortSub16_2x8.data[1] = 0x44444444;
uniExtractYtoShortSub16_2x8.data[4] = 0xaaaaaaaa;
uniExtractYtoShortSub16_2x8.data[8] = 0x00010001;
uniExtractYtoShortSub16_2x8.data[9] = 0x00010001;
uniExtractYtoShortSub16_2x8.data[10] = 0x00010001;
uniExtractYtoShortSub16_2x8.data[11] = 0x00010001;
uniExtractYtoShortSub16_2x8.data[12] = 0x00010001;
uniExtractYtoShortSub16_2x8.data[13] = 0x00010001;
uniExtractYtoShortSub16_2x8.data[14] = 0x00010001;
uniExtractYtoShortSub16_2x8.data[15] = 0x00010001;
}
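For reference, the patched fp16 constants decode to the BT.601 video-range YUV-to-RGB coefficients (rounded), which together with the Sub16/Sub128 extractors matches R = 1.164*(Y-16) + 1.596*(V-128) and the analogous G and B rows:

    0x3ca7 = 1.164  (Y gain)     0x3e62 = 1.596  (V -> R)
    0x3641 = 0.391  (U -> G)     0x3a81 = 0.813  (V -> G)
    0x4009 = 2.018  (U -> B)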
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toB_4x4", &uniConvertNV12toB_4x4); status = vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toB_4x4", &uniConvertNV12toB_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toG_4x4", &uniConvertNV12toG_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toG_4x4", &uniConvertNV12toG_4x4);
@ -266,12 +343,15 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder);
status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractUVtoCharSub128_2x8", &uniExtractUVtoCharSub128_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractUVtoCharSub128_2x8", &uniExtractUVtoCharSub128_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractYtoShortSub16_2x8", &uniExtractYtoShortSub16_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_b", &outputScaleVar_b); status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_b", &outputScaleVar_b);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_g", &outputScaleVar_g); status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_g", &outputScaleVar_g);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_r", &outputScaleVar_r); status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_r", &outputScaleVar_r);
status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUchartoFp32_4x4", &uniConvertUchartoFp32_4x4);
CHECK_STATUS_FAIL_GOTO(status, OnError);
switch( attr[0]->dtype ) switch( attr[0]->dtype )
{ {
case U8: case U8:
@ -335,6 +415,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer)
float outputScaleVar_b = 0.0f, outputScaleVar_g = 0.0f, outputScaleVar_r = 0.0f; float outputScaleVar_b = 0.0f, outputScaleVar_g = 0.0f, outputScaleVar_r = 0.0f;
float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f; float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f;
float resize = 0.0f; float resize = 0.0f;
vsi_bool ocv_nv12 = _check_nv12_type_from_env();
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
vsi_size_array_t * out_shape = NULL; vsi_size_array_t * out_shape = NULL;
@ -445,6 +526,17 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer)
0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000,
0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000 // Constant 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16 }; }, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertYtoShortSub16_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{ gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{
0x11111111, // TCfg 0x11111111, // TCfg
0x11110000, // ASelt 0x11110000, // ASelt
@ -487,11 +579,64 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer)
0x00000000, 0x00010000, 0x00000000, 0x00010000, 0x00000000, 0x00010000, 0x00000000, 0x00010000,
0x00000000, 0x00010000, 0x00000000, 0x00010000 // Constant 0x00000000, 0x00010000, 0x00000000, 0x00010000 // Constant
}, GPU_DP_TYPE_16 }; }, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertUchartoFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000,
0x00030002, // ABin
0x02020202, // BSelt
0x00000000,
0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
if (ocv_nv12)
{
uniConvertNV12toB_4x4.data[2] = 0x00010000;
uniConvertNV12toB_4x4.data[3] = 0x00230022;
uniConvertNV12toB_4x4.data[8] = 0x40093ca7;
uniConvertNV12toB_4x4.data[10] = 0x40093ca7;
uniConvertNV12toB_4x4.data[12] = 0x40093ca7;
uniConvertNV12toB_4x4.data[14] = 0x40093ca7;
uniConvertNV12toG_4x4.data[2] = 0x01010100;
uniConvertNV12toG_4x4.data[3] = 0x03230322;
uniConvertNV12toG_4x4.data[8] = 0x36413ca7;
uniConvertNV12toG_4x4.data[9] = 0x00003a81;
uniConvertNV12toG_4x4.data[10] = 0x36413ca7;
uniConvertNV12toG_4x4.data[11] = 0x00003a81;
uniConvertNV12toG_4x4.data[12] = 0x36413ca7;
uniConvertNV12toG_4x4.data[13] = 0x00003a81;
uniConvertNV12toG_4x4.data[14] = 0x36413ca7;
uniConvertNV12toG_4x4.data[15] = 0x00003a81;
uniConvertNV12toR_4x4.data[2] = 0x00110010;
uniConvertNV12toR_4x4.data[3] = 0x00330032;
uniConvertNV12toR_4x4.data[8] = 0x3e623ca7;
uniConvertNV12toR_4x4.data[10] = 0x3e623ca7;
uniConvertNV12toR_4x4.data[12] = 0x3e623ca7;
uniConvertNV12toR_4x4.data[14] = 0x3e623ca7;
uniConvertYtoShortSub16_2x8.data[0] = 0x99999999;
uniConvertYtoShortSub16_2x8.data[1] = 0x44444444;
uniConvertYtoShortSub16_2x8.data[4] = 0xaaaaaaaa;
uniConvertYtoShortSub16_2x8.data[8] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[9] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[10] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[11] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[12] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[13] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[14] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[15] = 0x00010001;
}
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toB_4x4", &uniConvertNV12toB_4x4); status = vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toB_4x4", &uniConvertNV12toB_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toG_4x4", &uniConvertNV12toG_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toG_4x4", &uniConvertNV12toG_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toR_4x4", &uniConvertNV12toR_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toR_4x4", &uniConvertNV12toR_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUVtoCharSub128_2x8", &uniConvertUVtoCharSub128_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUVtoCharSub128_2x8", &uniConvertUVtoCharSub128_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertYtoShortSub16_2x8", &uniConvertYtoShortSub16_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "xrIntFloat_16", &xrIntFloat_16); status |= vsi_nn_kernel_gpu_add_param(node, "xrIntFloat_16", &xrIntFloat_16);
status |= vsi_nn_kernel_gpu_add_param(node, "yrIntFloat_16", &yrIntFloat_16); status |= vsi_nn_kernel_gpu_add_param(node, "yrIntFloat_16", &yrIntFloat_16);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_b", &outputScaleVar_b); status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_b", &outputScaleVar_b);
@ -506,6 +651,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateYShift_2x8", &uniCalculateYShift_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateYShift_2x8", &uniCalculateYShift_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateUVShift_2x8", &uniCalculateUVShift_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateUVShift_2x8", &uniCalculateUVShift_2x8);
} }
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUchartoFp32_4x4", &uniConvertUchartoFp32_4x4);
CHECK_STATUS_FAIL_GOTO(status, OnError ); CHECK_STATUS_FAIL_GOTO(status, OnError );
status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder);

View File

@ -249,6 +249,18 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_copy_initializer)
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 }; }, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertUchartoFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000,
0x00030002, // ABin
0x02020202, // BSelt
0x00000000,
0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toB_4x4", &uniConvertYUV422toB_4x4); status = vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toB_4x4", &uniConvertYUV422toB_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toG_4x4", &uniConvertYUV422toG_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toG_4x4", &uniConvertYUV422toG_4x4);
@ -262,6 +274,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_copy_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUchartoFp32_4x4", &uniConvertUchartoFp32_4x4);
CHECK_STATUS_FAIL_GOTO(status, OnError);
switch( attr[0]->dtype ) switch( attr[0]->dtype )
{ {
case U8: case U8:
@ -461,6 +475,18 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_initializer)
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 }; }, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertUchartoFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000,
0x00030002, // ABin
0x02020202, // BSelt
0x00000000,
0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toB_4x4", &uniConvertYUV422toB_4x4); status = vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toB_4x4", &uniConvertYUV422toB_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toG_4x4", &uniConvertYUV422toG_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toG_4x4", &uniConvertYUV422toG_4x4);
@ -477,6 +503,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractYtoShortSub16_4x4", &uniExtractYtoShortSub16_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractYtoShortSub16_4x4", &uniExtractYtoShortSub16_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder);
status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUchartoFp32_4x4", &uniConvertUchartoFp32_4x4);
CHECK_STATUS_FAIL_GOTO(status, OnError ); CHECK_STATUS_FAIL_GOTO(status, OnError );
switch( attr[0]->dtype ) switch( attr[0]->dtype )

View File

@ -664,9 +664,15 @@ static vsi_nn_kernel_node_t _setup
hashkeys[1] = BILINEAR_NHWC_BOUND_HASH_KEY( in_dtype, out_dtype, (vsi_size_t)up_scale ); hashkeys[1] = BILINEAR_NHWC_BOUND_HASH_KEY( in_dtype, out_dtype, (vsi_size_t)up_scale );
status = _query_kernel( ikernels[0], hashkeys[0], 0); status = _query_kernel( ikernels[0], hashkeys[0], 0);
CHECK_STATUS_FAIL_GOTO(status, final ); if (status != VSI_SUCCESS)
{
goto final;
}
status = _query_kernel( kernel, hashkeys[1], 1); status = _query_kernel( kernel, hashkeys[1], 1);
CHECK_STATUS_FAIL_GOTO(status, final ); if (status != VSI_SUCCESS)
{
goto final;
}
shapes[0][0] = depth * inputs[0]->attr.size[1]; shapes[0][0] = depth * inputs[0]->attr.size[1];
shapes[0][1] = inputs[0]->attr.size[2]; shapes[0][1] = inputs[0]->attr.size[2];

View File

@ -532,10 +532,10 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_ref_initializer)
width = (width + 15) / 16; width = (width + 15) / 16;
} }
input0_zp = attr[0]->asymm.zero_point; input0_zp = attr[0]->zero_point;
input0_scale = attr[0]->asymm.scale; input0_scale = attr[0]->scale;
output_zp = attr[1]->asymm.zero_point; output_zp = attr[1]->zero_point;
output_scale = 1.0f / attr[1]->asymm.scale; output_scale = 1.0f / attr[1]->scale;
gpu_param.global_scale[0] = 1; gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1; gpu_param.global_scale[1] = 1;
@ -670,10 +670,10 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_update_initializer)
update_width = (int32_t)(attr[1]->shape->data[0]); update_width = (int32_t)(attr[1]->shape->data[0]);
index_num = (int32_t)(attr[0]->shape->data[1]); index_num = (int32_t)(attr[0]->shape->data[1]);
input1_zp = attr[1]->asymm.zero_point; input1_zp = attr[1]->zero_point;
input1_scale = attr[1]->asymm.scale; input1_scale = attr[1]->scale;
output_zp = attr[2]->asymm.zero_point; output_zp = attr[2]->zero_point;
output_scale = 1.0f / attr[2]->asymm.scale; output_scale = 1.0f / attr[2]->scale;
if (coord_dim == 5) if (coord_dim == 5)
{ {
@ -916,10 +916,10 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_reset_initializer)
} }
width = element_size / 8; width = element_size / 8;
input_zp0 = attr[0]->asymm.zero_point; input_zp0 = attr[0]->zero_point;
input_scale0 = attr[0]->asymm.scale; input_scale0 = attr[0]->scale;
output_zp = attr[1]->asymm.zero_point; output_zp = attr[1]->zero_point;
output_scale = attr[1]->asymm.scale; output_scale = attr[1]->scale;
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE) if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE)
{ {
@ -933,9 +933,14 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_reset_initializer)
gpu_param.global_scale[0] = 1; gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1; gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1; gpu_param.global_scale[2] = 1;
if (element_size < 8)
gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1) {
/ gpu_param.global_scale[0], 4); gpu_param.global_size[0] = element_size;
}
else
{
gpu_param.global_size[0] = width;
}
gpu_param.global_size[1] = 1; gpu_param.global_size[1] = 1;
gpu_param.global_size[2] = 1; gpu_param.global_size[2] = 1;
@ -1006,7 +1011,7 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_update_initializer)
int32_t coord_dim = 0; int32_t coord_dim = 0;
int32_t strides[VSI_NN_MAX_DIM_NUM] = {0}; int32_t strides[VSI_NN_MAX_DIM_NUM] = {0};
int32_t coord_strides[8] = {0}; int32_t coord_strides[8] = {0};
int32_t *coord_strides1 = coord_strides + 4; int32_t coord_strides1[4] = {0};
int32_t input2_zp = 0; int32_t input2_zp = 0;
int32_t i = 0; int32_t i = 0;
@ -1046,13 +1051,14 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_update_initializer)
width = block_size / 4; width = block_size / 4;
} }
input2_zp = attr[1]->asymm.zero_point; input2_zp = attr[1]->zero_point;
coord_strides[coord_dim - 1] = 1; coord_strides[coord_dim - 1] = 1;
for (i = 0; i < coord_dim - 1; i++) for (i = 0; i < coord_dim - 1; i++)
{ {
coord_strides[i] = strides[coord_dim - 2 - i]; coord_strides[i] = strides[coord_dim - 2 - i];
} }
memcpy(coord_strides1, coord_strides + 4, 4 * sizeof(int32_t));
gpu_param.global_scale[0] = 1; gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1; gpu_param.global_scale[1] = 1;
@ -1165,7 +1171,7 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_ref_initializer)
int32_t coord_dim = 0; int32_t coord_dim = 0;
int32_t strides[VSI_NN_MAX_DIM_NUM] = {0}; int32_t strides[VSI_NN_MAX_DIM_NUM] = {0};
int32_t coord_strides[8] = {0}; int32_t coord_strides[8] = {0};
int32_t *coord_strides1 = coord_strides + 4; int32_t coord_strides1[4] = {0};
float output_zp = 0; float output_zp = 0;
float input_scale = 1.0f; float input_scale = 1.0f;
float output_scale = 1.0f; float output_scale = 1.0f;
@ -1202,9 +1208,9 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_ref_initializer)
update_width = (int32_t)(attr[1]->shape->data[0]); update_width = (int32_t)(attr[1]->shape->data[0]);
index_num = (int32_t)(attr[0]->shape->data[1]); index_num = (int32_t)(attr[0]->shape->data[1]);
input_scale = attr[1]->asymm.scale; input_scale = attr[1]->scale;
output_scale = attr[2]->asymm.scale; output_scale = attr[2]->scale;
output_zp = (float)attr[2]->asymm.zero_point; output_zp = (float)attr[2]->zero_point;
if (attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE) if (attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE)
{ {
input_scale = 1.0f; input_scale = 1.0f;
@ -1220,6 +1226,7 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_ref_initializer)
{ {
coord_strides[i] = strides[coord_dim - 2 - i]; coord_strides[i] = strides[coord_dim - 2 - i];
} }
memcpy(coord_strides1, coord_strides + 4, 4 * sizeof(int32_t));
width = block_size; width = block_size;
if (block_size % 4 == 0) if (block_size % 4 == 0)
@ -1337,9 +1344,14 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_copy_initializer)
gpu_param.global_scale[0] = 1; gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1; gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1; gpu_param.global_scale[2] = 1;
if (element_size < 8)
gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1) {
/ gpu_param.global_scale[0], 4); gpu_param.global_size[0] = element_size;
}
else
{
gpu_param.global_size[0] = width;
}
gpu_param.global_size[1] = 1; gpu_param.global_size[1] = 1;
gpu_param.global_size[2] = 1; gpu_param.global_size[2] = 1;

View File

@ -479,6 +479,7 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape
vsi_size_t* temp_shape_y = NULL; vsi_size_t* temp_shape_y = NULL;
vsi_size_t* temp_shape_output = NULL; vsi_size_t* temp_shape_output = NULL;
vsi_size_t temp_rank = 0; vsi_size_t temp_rank = 0;
vsi_bool exceed_maxsize = FALSE;
#define _swap_size(a, b, tmp) \ #define _swap_size(a, b, tmp) \
{ \ { \
@ -490,6 +491,27 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape
VSI_UNREFERENCED(rank_x); VSI_UNREFERENCED(rank_x);
VSI_UNREFERENCED(rank); VSI_UNREFERENCED(rank);
for (i = 0; i < rank_output; i++)
{
if (shape_output[i] > GPU_TENSOR_MAX_WIDTH)
{
exceed_maxsize = TRUE;
}
}
if (exceed_maxsize)
{
for (i = 0; i < rank_output; i++)
{
out_shape_x[i] = shape_x[i];
out_shape_y[i] = multiples[i];
out_shape_output[i] = shape_output[i];
}
*out_rank_output = rank_output;
ret = TRUE;
goto final;
}
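Worked example of the new pass-through path, taking GPU_TENSOR_MAX_WIDTH = 65536 as an assumed value for illustration: tiling shape_x = {2, 3} by multiples = {4, 30000} yields shape_output = {8, 90000}; 90000 exceeds the limit, so the shapes are now returned unmerged with ret = TRUE instead of being folded into an even wider dimension.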
temp_shape_x = (vsi_size_t*)malloc(rank * sizeof(vsi_size_t)); temp_shape_x = (vsi_size_t*)malloc(rank * sizeof(vsi_size_t));
if (temp_shape_x == NULL) if (temp_shape_x == NULL)
{ {

View File

@ -156,5 +156,17 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(inverse_sigmoid)
#if (VX_TENSOR_SELECT_VX_SUPPORT) #if (VX_TENSOR_SELECT_VX_SUPPORT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(select) REGISTER_VX_FIRST_KERNEL_SELECTOR(select)
#endif #endif
#if (VX_TENSOR_POW_API_SUPPORT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(pow)
#endif
#if (VX_TENSOR_GATHER_API_SUPPORT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(gather)
#endif
#if (VX_RELATIONAL_OPS_VX_SUPPORT_EXT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(relational_ops)
#endif
#if (VX_TENSOR_TILE_API_SUPPORT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(tile)
#endif
__END_DECLS __END_DECLS

View File

@ -0,0 +1,82 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#if (VX_TENSOR_GATHER_API_SUPPORT)
#define REGISTER_GATHEROPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
); \
REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
)
REGISTER_GATHEROPENVX_KERNEL( gather )
{
vx_node node = NULL;
int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis");
int32_t batch_dims = vsi_nn_kernel_param_get_int32(params, "batch_dims");
VSI_UNREFERENCED(kernel);
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
node = vxTensorGatherNode(graph->g,
inputs[0]->t,
inputs[1]->t,
axis,
batch_dims,
outputs[0]->t
);
return (vsi_nn_kernel_node_t)node;
} /* gather() */
#undef REGISTER_GATHEROPENVX_KERNEL
#endif
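Reference semantics for the node registered above, as a plain C sketch of the simplest case (axis = 0, batch_dims = 0): each index selects one whole slice of the input. The helper is illustrative only.

    #include <stddef.h>
    #include <stdint.h>

    static void gather_axis0_ref( const float* in, const int32_t* idx,
                                  float* out, size_t idx_num, size_t slice )
    {
        size_t i, j;
        for ( i = 0; i < idx_num; i++ )
        {
            for ( j = 0; j < slice; j++ )
            {
                out[i * slice + j] = in[(size_t)idx[i] * slice + j];
            }
        }
    }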

View File

@ -0,0 +1,73 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#if (VX_TENSOR_POW_API_SUPPORT)
#define REGISTER_POWOPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
); \
REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
)
REGISTER_POWOPENVX_KERNEL( pow )
{
vx_node node = vxTensorPowNode( graph->g, inputs[0]->t, inputs[1]->t,
outputs[0]->t );
VSI_UNREFERENCED(kernel);
VSI_UNREFERENCED(params);
VSI_UNREFERENCED(output_num);
VSI_UNREFERENCED(input_num);
return (vsi_nn_kernel_node_t)node;
} /* pow() */
#undef REGISTER_POWOPENVX_KERNEL
#endif
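For comparison, the elementwise semantics of the node above as a plain C reference over same-sized buffers (broadcasting, if the driver supports it, is outside this sketch):

    #include <math.h>
    #include <stddef.h>

    static void pow_ref( const float* x, const float* y, float* out, size_t n )
    {
        size_t i;
        for ( i = 0; i < n; i++ )
        {
            out[i] = powf( x[i], y[i] );   /* elementwise x^y */
        }
    }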

View File

@ -0,0 +1,83 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#if (VX_RELATIONAL_OPS_VX_SUPPORT_EXT)
#define REGISTER_RELATIONAL_OPS_OPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
); \
REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
)
REGISTER_RELATIONAL_OPS_OPENVX_KERNEL( relational_ops )
{
vx_node node = NULL;
int32_t operation = vsi_nn_kernel_param_get_int32(params, "operation");
vx_tensor inputs_tensor[2] = {NULL};
inputs_tensor[0] = inputs[0]->t;
inputs_tensor[1] = inputs[1]->t;
VSI_UNREFERENCED(kernel);
VSI_UNREFERENCED(output_num);
node = vxRelationalLayer(graph->g,
operation,
inputs_tensor,
(uint32_t)input_num,
outputs[0]->t
);
return (vsi_nn_kernel_node_t)node;
} /* relational_ops() */
#undef REGISTER_RELATIONAL_OPS_OPENVX_KERNEL
#endif
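A sketch of what the operation code selects, with placeholder numbering (the driver's actual enum values are not shown in this file, so the cases below are illustrative only):

    #include <stdint.h>

    static uint8_t relational_ref( float a, float b, int operation )
    {
        switch ( operation )
        {
            case 0:  return a >  b;   /* GREATER       */
            case 1:  return a >= b;   /* GREATER_EQUAL */
            case 2:  return a <  b;   /* LESS          */
            case 3:  return a <= b;   /* LESS_EQUAL    */
            case 4:  return a == b;   /* EQUAL         */
            default: return a != b;   /* NOT_EQUAL     */
        }
    }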

View File

@ -0,0 +1,78 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#if (VX_TENSOR_TILE_API_SUPPORT)
#define REGISTER_TILEOPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
); \
REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
)
REGISTER_TILEOPENVX_KERNEL( tile )
{
vx_node node = NULL;
VSI_UNREFERENCED(kernel);
VSI_UNREFERENCED(params);
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
node = vxTensorTileNode(graph->g,
inputs[0]->t,
inputs[1]->t,
outputs[0]->t
);
return (vsi_nn_kernel_node_t)node;
} /* tile() */
#undef REGISTER_TILEOPENVX_KERNEL
#endif
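Tile semantics in one dimension as a plain C reference (the node takes the per-dimension multiples as the second input tensor; here the multiple is passed directly for clarity):

    #include <stddef.h>

    static void tile_1d_ref( const float* in, float* out,
                             size_t width, size_t multiple )
    {
        size_t m, x;
        for ( m = 0; m < multiple; m++ )
        {
            for ( x = 0; x < width; x++ )
            {
                out[m * width + x] = in[x];   /* repeat the row `multiple` times */
            }
        }
    }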

View File

@ -88,6 +88,8 @@ __kernel void cumsum_##name##toU8_axis2( \
\ \
src_type sum = (src_type)(0); \ src_type sum = (src_type)(0); \
uint4 dst = (uint4)(0); \ uint4 dst = (uint4)(0); \
int tmp_zp = convert_int_rte(output_zp); \
dst.x = convert_uint_sat(tmp_zp); \
\ \
float cnt = 0.0f; \ float cnt = 0.0f; \
\ \
@ -252,6 +254,8 @@ __kernel void cumsum_##name##toU8_axis1( \
\ \
src_type sum = (src_type)(0); \ src_type sum = (src_type)(0); \
uint4 dst = (uint4)(0); \ uint4 dst = (uint4)(0); \
int tmp_zp = convert_int_rte(output_zp); \
dst.x = convert_uint_sat(tmp_zp); \
\ \
float cnt = 0; \ float cnt = 0; \
\ \
@ -416,6 +420,8 @@ __kernel void cumsum_##name##toU8_axis0( \
\ \
src_type sum = (src_type)(0); \ src_type sum = (src_type)(0); \
uint4 dst = (uint4)(0); \ uint4 dst = (uint4)(0); \
int tmp_zp = convert_int_rte(output_zp); \
dst.x = convert_uint_sat(tmp_zp); \
\ \
float cnt = 0; \ float cnt = 0; \
\ \
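Why these hunks seed dst.x before the exclusive branches write it: the first element of an exclusive cumsum represents real-valued 0, which under asymmetric U8 quantization encodes as the rounded zero point rather than raw 0. A plain C equivalent of the OpenCL seeding:

    #include <math.h>
    #include <stdint.h>

    static uint8_t quantized_zero_u8( float output_zp )
    {
        long zp = lrintf( output_zp );   /* convert_int_rte()       */
        if ( zp < 0 )   return 0;        /* convert_uint_sat() ...  */
        if ( zp > 255 ) return 255;      /* ... clamps to U8 range  */
        return (uint8_t)zp;
    }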

View File

@ -85,12 +85,15 @@ __kernel void cumsum_U8toU8_axis1_2D(
uint4 sum = (uint4)(0); uint4 sum = (uint4)(0);
uint4 dst = (uint4)(0); uint4 dst = (uint4)(0);
int tmp_zp = convert_int_rte(output_zp);
dst.x = convert_uint_sat(tmp_zp);
float cnt = 0; float cnt = 0;
if(exclusive && rev) if(exclusive && rev)
{ {
coord.w = height - 1; coord.w = height - 1;
write_imageui(output, coord.zw, sum); write_imageui(output, coord.zw, dst);
for(coord.y = height - 1; coord.y > 0; coord.y--) for(coord.y = height - 1; coord.y > 0; coord.y--)
{ {
uint4 data = read_imageui(input, coord.xy); uint4 data = read_imageui(input, coord.xy);
@ -107,7 +110,7 @@ __kernel void cumsum_U8toU8_axis1_2D(
} }
else if(exclusive) else if(exclusive)
{ {
write_imageui(output, coord.zw, sum); write_imageui(output, coord.zw, dst);
for(coord.y = 0; coord.y < height - 1; coord.y++) for(coord.y = 0; coord.y < height - 1; coord.y++)
{ {
uint4 data = read_imageui(input, coord.xy); uint4 data = read_imageui(input, coord.xy);
@ -173,6 +176,8 @@ __kernel void cumsum_F32toU8_axis1_2D(
float4 sum = (float4)(0); float4 sum = (float4)(0);
uint4 dst = (uint4)(0); uint4 dst = (uint4)(0);
int tmp_zp = convert_int_rte(output_zp);
dst.x = convert_uint_sat(tmp_zp);
float cnt = 0; float cnt = 0;
@ -331,13 +336,16 @@ __kernel void cumsum_U8toU8_axis0_2D(
uint4 sum = (uint4)(0); uint4 sum = (uint4)(0);
uint4 dst = (uint4)(0); uint4 dst = (uint4)(0);
int tmp_zp = convert_int_rte(output_zp);
dst.x = convert_uint_sat(tmp_zp);
float cnt = 0.0f; float cnt = 0.0f;
if(exclusive && rev) if(exclusive && rev)
{ {
coord.x = width - 1; coord.x = width - 1;
coord.z = coord.x; coord.z = coord.x;
write_imageui(output, coord.zw, sum); write_imageui(output, coord.zw, dst);
for(; coord.x > 0; coord.x--) for(; coord.x > 0; coord.x--)
{ {
uint4 data = read_imageui(input, coord.xy); uint4 data = read_imageui(input, coord.xy);
@ -355,7 +363,7 @@ __kernel void cumsum_U8toU8_axis0_2D(
else if(exclusive) else if(exclusive)
{ {
coord.z = 0; coord.z = 0;
write_imageui(output, coord.zw, sum); write_imageui(output, coord.zw, dst);
for(coord.x = 0; coord.x < width - 1; coord.x++) for(coord.x = 0; coord.x < width - 1; coord.x++)
{ {
uint4 data = read_imageui(input, coord.xy); uint4 data = read_imageui(input, coord.xy);
@ -421,9 +429,10 @@ __kernel void cumsum_F32toU8_axis0_2D(
float4 sum = (float4)(0); float4 sum = (float4)(0);
uint4 dst = (uint4)(0); uint4 dst = (uint4)(0);
int tmp_zp = convert_int_rte(output_zp);
dst.x = convert_uint_sat(tmp_zp);
float cnt = 0.0f; float cnt = 0.0f;
if(exclusive && rev) if(exclusive && rev)
{ {
coord.x = width - 1; coord.x = width - 1;

View File

@ -1,3 +1,6 @@
#pragma OPENCL EXTENSION cl_viv_vx_extension : enable
#include "cl_viv_vx_ext.h"
#define rlogE (0.693147182f) #define rlogE (0.693147182f)
float LOG(float x) float LOG(float x)
{ {
@ -5,16 +8,11 @@ float LOG(float x)
return x * rlogE; return x * rlogE;
} }
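Note on the helper: rlogE is ln(2) = 0.6931472, so LOG() turns a base-2 logarithm into a natural one (ln x = log2(x) * ln 2, assuming the elided part of the body computes log2(x) first); this pairs with the exp2() calls the kernels below use in place of exp().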
__kernel void log_softmax_axis0_F32toF32 __kernel void log_softmax_axis0_F32toF32(
(
__read_only image2d_array_t input, __read_only image2d_array_t input,
__write_only image2d_array_t output, __write_only image2d_array_t output,
int axis, int axis, float beta,
float beta, float scale, float scaleOut, float zpOut)
float scale,
float scaleOut,
float zpOut
)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
@ -58,16 +56,11 @@ __kernel void log_softmax_axis0_F32toF32
}
}
__kernel void log_softmax_axis0_F32toF32_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis, float beta,
float scale, float scaleOut, float zpOut)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -110,16 +103,11 @@ __kernel void log_softmax_axis0_F32toF32_2D
}
}
__kernel void log_softmax_axis0_U8toU8(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis, float beta,
float scale, float scaleOut, float zpOut)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -165,16 +153,11 @@ __kernel void log_softmax_axis0_U8toU8
}
}
__kernel void log_softmax_axis0_U8toU8_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis, float beta,
float scale, float scaleOut, float zpOut)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -217,4 +200,109 @@ __kernel void log_softmax_axis0_U8toU8_2D
coord_in.x++;
}
}
__kernel void log_softmax_axis0_BF16toBF16(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis, float beta,
float scale, float scaleOut, float zpOut)
{
int x = get_global_id(0);
int y = get_global_id(1);
int z = get_global_id(2);
int width = get_image_width(input);
int4 coord_in = (int4)(0, y, z, 0);
float4 maxValue, src, dst = {0.0};
uint4 data, val, out;
data = read_imageui(input, coord_in);
data = data << 16;
_viv_asm(COPY, maxValue, data, 16);
for (coord_in.x = 1; coord_in.x < width; )
{
data = read_imageui(input, coord_in);
coord_in.x++;
data = data << 16;
_viv_asm(COPY, src, data, 16);
maxValue = maxValue > src ? maxValue : src;
}
float sum = 0.f;
for (coord_in.x = 0; coord_in.x < width; )
{
data = read_imageui(input, coord_in);
coord_in.x++;
data = data << 16;
_viv_asm(COPY, src, data, 16);
sum += exp2((src.x - maxValue.x) * scale);
}
float logSum = LOG(sum);
for (coord_in.x = 0; coord_in.x < width; )
{
data = read_imageui(input, coord_in);
data = data << 16;
_viv_asm(COPY, src, data, 16);
dst.x = (src.x - maxValue.x) * beta - logSum;
_viv_asm(COPY, val, dst, 16);
out = val >> 16;
write_imageui(output, coord_in, out);
coord_in.x++;
}
}
__kernel void log_softmax_axis0_BF16toBF16_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis, float beta,
float scale, float scaleOut, float zpOut)
{
int x = get_global_id(0);
int y = get_global_id(1);
int width = get_image_width(input);
int2 coord_in = (int2)(0, y);
float4 maxValue, src, dst = {0.0};
uint4 data, val, out;
data = read_imageui(input, coord_in);
data = data << 16;
_viv_asm(COPY, maxValue, data, 16);
for (coord_in.x = 1; coord_in.x < width; )
{
data = read_imageui(input, coord_in);
coord_in.x++;
data = data << 16;
_viv_asm(COPY, src, data, 16);
maxValue = maxValue > src ? maxValue : src;
}
float sum = 0.0f;
for (coord_in.x = 0; coord_in.x < width; )
{
data = read_imageui(input, coord_in);
coord_in.x++;
data = data << 16;
_viv_asm(COPY, src, data, 16);
sum += exp2((src.x - maxValue.x) * scale);
}
float logSum = LOG(sum);
for (coord_in.x = 0; coord_in.x < width; )
{
data = read_imageui(input, coord_in);
data = data << 16;
_viv_asm(COPY, src, data, 16);
dst.x = (src.x - maxValue.x) * beta - logSum;
_viv_asm(COPY, val, dst, 16);
out = val >> 16;
write_imageui(output, coord_in, out);
coord_in.x++;
}
}
#undef rlogE
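
The new BF16 kernels above move data as raw 16-bit patterns: a bfloat16 value is the upper half of an IEEE-754 float, so `data << 16` followed by a bit-pattern copy yields the float value, and shifting right by 16 truncates back. A portable sketch of the same conversions, assuming the _viv_asm(COPY, ...) lines behave like OpenCL's reinterpreting casts:

// bfloat16 <-> float via bit reinterpretation (as_float/as_uint are the
// standard OpenCL reinterpret casts).
float bf16_to_f32(ushort u) { return as_float(((uint)u) << 16); }
ushort f32_to_bf16(float f) { return (ushort)(as_uint(f) >> 16); } // truncating, as in `out = val >> 16`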

View File

@ -1,3 +1,6 @@
#pragma OPENCL EXTENSION cl_viv_vx_extension : enable
#include "cl_viv_vx_ext.h"
#define rlogE (0.693147182f)
float LOG(float x)
@ -6,16 +9,11 @@ float LOG(float x)
return x * rlogE;
}
__kernel void log_softmax_axis1_F32toF32(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis, float beta,
float scale, float scaleOut, float zpOut)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -59,16 +57,11 @@ __kernel void log_softmax_axis1_F32toF32
}
}
__kernel void log_softmax_axis1_F32toF32_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis, float beta,
float scale, float scaleOut, float zpOut)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -111,16 +104,11 @@ __kernel void log_softmax_axis1_F32toF32_2D
}
}
__kernel void log_softmax_axis1_U8toU8(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis, float beta,
float scale, float scaleOut, float zpOut)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -166,16 +154,11 @@ __kernel void log_softmax_axis1_U8toU8
}
}
__kernel void log_softmax_axis1_U8toU8_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis, float beta,
float scale, float scaleOut, float zpOut)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -218,4 +201,111 @@ __kernel void log_softmax_axis1_U8toU8_2D
coord_in.y++;
}
}
__kernel void log_softmax_axis1_BF16toBF16(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis, float beta,
float scale, float scaleOut, float zpOut)
{
int x = get_global_id(0);
int y = get_global_id(1);
int z = get_global_id(2);
int height = get_image_height(input);
int4 coord_in = (int4)(x, 0, z, 0);
float4 maxValue, src, dst = {0.0};
uint4 data, val, out;
data = read_imageui(input, coord_in);
data = data << 16;
_viv_asm(COPY, maxValue, data, 16);
for (coord_in.y = 1; coord_in.y < height; )
{
data = read_imageui(input, coord_in);
coord_in.y++;
data = data << 16;
_viv_asm(COPY, src, data, 16);
maxValue = maxValue > src ? maxValue : src;
}
float sum = 0.f;
for (coord_in.y = 0; coord_in.y < height; )
{
data = read_imageui(input, coord_in);
coord_in.y++;
data = data << 16;
_viv_asm(COPY, src, data, 16);
sum += exp2((src.x - maxValue.x) * scale);
}
float logSum = LOG(sum);
for (coord_in.y = 0; coord_in.y < height; )
{
data = read_imageui(input, coord_in);
data = data << 16;
_viv_asm(COPY, src, data, 16);
dst.x = (src.x - maxValue.x) * beta - logSum;
_viv_asm(COPY, val, dst, 16);
out = val >> 16;
write_imageui(output, coord_in, out);
coord_in.y++;
}
}
__kernel void log_softmax_axis1_BF16toBF16_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis, float beta,
float scale, float scaleOut, float zpOut)
{
int x = get_global_id(0);
int y = get_global_id(1);
int height = get_image_height(input);
int2 coord_in = (int2)(x, 0);
float4 maxValue, src, dst = {0.0};
uint4 data, val, out;
data = read_imageui(input, coord_in);
data = data << 16;
_viv_asm(COPY, maxValue, data, 16);
for (coord_in.y = 1; coord_in.y < height; )
{
data = read_imageui(input, coord_in);
coord_in.y++;
data = data << 16;
_viv_asm(COPY, src, data, 16);
maxValue = maxValue > src ? maxValue : src;
}
float sum = 0.0f;
for (coord_in.y = 0; coord_in.y < height; )
{
data = read_imageui(input, coord_in);
coord_in.y++;
data = data << 16;
_viv_asm(COPY, src, data, 16);
sum += exp2((src.x - maxValue.x) * scale);
}
float logSum = LOG(sum);
for (coord_in.y = 0; coord_in.y < height; )
{
data = read_imageui(input, coord_in);
data = data << 16;
_viv_asm(COPY, src, data, 16);
dst.x = (src.x - maxValue.x) * beta - logSum;
_viv_asm(COPY, val, dst, 16);
out = val >> 16;
write_imageui(output, coord_in, out);
coord_in.y++;
}
}
#undef rlogE
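
All log_softmax variants in these files evaluate the stabilized form log_softmax(x_i) = (x_i - M) * beta - ln(sum_j exp((x_j - M) * beta)) with M = max(x), but in base 2: the sum uses exp2, and LOG() converts log2 back to ln via rlogE = ln(2). That is only consistent if the host passes scale = beta * log2(e); a reference sketch under that assumption (M_LOG2E_F and M_LN2_F are OpenCL's log2(e) and ln(2) constants):

float log_sum_exp(__global const float* x, int n, float beta)
{
    float m = x[0];
    for (int i = 1; i < n; ++i) m = fmax(m, x[i]);
    float s = 0.0f;
    for (int i = 0; i < n; ++i) s += exp2((x[i] - m) * beta * M_LOG2E_F);
    return log2(s) * M_LN2_F; // == ln(sum_j exp((x_j - m) * beta))
}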

View File

@ -1,3 +1,6 @@
#pragma OPENCL EXTENSION cl_viv_vx_extension : enable
#include "cl_viv_vx_ext.h"
#define rlogE (0.693147182f)
float LOG(float x)
{
@ -112,4 +115,68 @@ __kernel void log_softmax_axis2_U8toU8
coord_in.z++;
}
}
__kernel void log_softmax_axis2_BF16toBF16
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis,
float beta,
float scale,
float scaleOut,
float zpOut
)
{
int x = get_global_id(0);
int y = get_global_id(1);
int z = get_global_id(2);
int depth = get_image_array_size(input);
int4 coord_in = (int4)(x, y, 0, 0);
float4 maxValue;
float4 src, dst = {0.0};
uint4 data, val, out;
// Find max element value which we'll use to ensure numerical stability
// taking advantage of the following equality:
// exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
data = read_imageui(input, coord_in);
data = data << 16;
_viv_asm(COPY, maxValue, data, 16);
for (coord_in.z = 1; coord_in.z < depth; )
{
data = read_imageui(input, coord_in);
coord_in.z++;
data = data << 16;
_viv_asm(COPY, src, data, 16);
maxValue = maxValue > src ? maxValue : src;
}
// Compute sum.
float sum = 0.f;
for (coord_in.z = 0; coord_in.z < depth; )
{
data = read_imageui(input, coord_in);
coord_in.z++;
data = data << 16;
_viv_asm(COPY, src, data, 16);
sum += exp2((src.x - maxValue.x) * scale);
}
// Compute result.
float logSum = LOG(sum);
for (coord_in.z = 0; coord_in.z < depth; )
{
data = read_imageui(input, coord_in);
data = data << 16;
_viv_asm(COPY, src, data, 16);
dst.x = (src.x - maxValue.x) * beta - logSum;
_viv_asm(COPY, val, dst, 16);
out = val >> 16;
write_imageui(output, coord_in, out);
coord_in.z++;
}
}
#undef rlogE

View File

@ -0,0 +1,127 @@
#pragma OPENCL EXTENSION CL_VIV_asm : enable
__kernel void gemm_4x_F32F32toF32_2D(
__read_only image2d_t inputA,
__read_only image2d_t inputB,
__write_only image2d_t output,
int M,
int K,
int N,
int ac2zero,
int bc2zero,
float scale_a,
float zp_a,
float scale_b,
float zp_b,
float scale_out,
float zp_out
)
{
int offset0 = get_global_id(0) * K;
int offset1 = offset0 + K;
int offset2 = offset1 + K;
int offset3 = offset2 + K;
int out_offset = get_global_id(0);
int z = 0;
float4 sum = (float4)(0, 0, 0, 0);
Image in0_tensor = create_image_from_image2d(inputA, 4);
__global float* in0_ptr = (__global float*)in0_tensor.ptr;
__global float* in0_ptr0 = in0_ptr + offset0;
__global float* in0_ptr1 = in0_ptr + offset1;
__global float* in0_ptr2 = in0_ptr + offset2;
__global float* in0_ptr3 = in0_ptr + offset3;
Image in1_tensor = create_image_from_image2d(inputB, 4);
__global float* in1_ptr = (__global float*)in1_tensor.ptr;
Image o_tensor = create_image_from_image2d(output, 4);
__global float* output_ptr = (__global float*)o_tensor.ptr + out_offset;
int step = K >> 2;
for(z = 0; z < step; z++)
{
float4 tempA0, tempA1, tempA2, tempA3;
float4 tempB0;
tempB0 = vload4(z, in1_ptr);
tempA0 = vload4(z, in0_ptr0);
tempA1 = vload4(z, in0_ptr1);
tempA2 = vload4(z, in0_ptr2);
tempA3 = vload4(z, in0_ptr3);
sum.x += dot(tempA0, tempB0);
sum.y += dot(tempA1, tempB0);
sum.z += dot(tempA2, tempB0);
sum.w += dot(tempA3, tempB0);
}
vstore4(sum, 0, output_ptr);
}
__kernel void gemm_4x_transa_F32F32toF32_2D(
__read_only image2d_t inputA,
__read_only image2d_t inputB,
__write_only image2d_t output,
int M,
int K,
int N,
int ac2zero,
int bc2zero,
float scale_a,
float zp_a,
float scale_b,
float zp_b,
float scale_out,
float zp_out
)
{
int offset0 = get_global_id(0);
int offset1 = M << 2;
int z = 0;
float4 sum = (float4)(0, 0, 0, 0);
Image in0_tensor = create_image_from_image2d(inputA, 4);
__global float* in0_ptr0 = (__global float*)in0_tensor.ptr + offset0;
__global float* in0_ptr1 = in0_ptr0 + M;
__global float* in0_ptr2 = in0_ptr1 + M;
__global float* in0_ptr3 = in0_ptr2 + M;
Image in1_tensor = create_image_from_image2d(inputB, 4);
__global float* in1_ptr = (__global float*)in1_tensor.ptr;
Image o_tensor = create_image_from_image2d(output, 4);
__global float* output_ptr = (__global float*)o_tensor.ptr + offset0;
int step = K >> 2;
for(z = 0; z < step; z++)
{
float4 tempA0, tempA1, tempA2, tempA3;
float4 tempB0;
tempB0 = vload4(z, in1_ptr);
tempA0 = vload4(0, in0_ptr0);
tempA1 = vload4(0, in0_ptr1);
tempA2 = vload4(0, in0_ptr2);
tempA3 = vload4(0, in0_ptr3);
sum += tempA0 * tempB0.x;
sum += tempA1 * tempB0.y;
sum += tempA2 * tempB0.z;
sum += tempA3 * tempB0.w;
in0_ptr0 = in0_ptr0 + offset1;
in0_ptr1 = in0_ptr1 + offset1;
in0_ptr2 = in0_ptr2 + offset1;
in0_ptr3 = in0_ptr3 + offset1;
}
vstore4(sum, 0, output_ptr);
}
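
Both kernels above treat inputB as a single K-length vector and produce four consecutive outputs per invocation, assuming K is a multiple of 4 (step = K >> 2); the transa variant reads A column-major, advancing all four row pointers by 4 * M floats per step. A hypothetical scalar reference of the product they tile, assuming row-major A:

// One output element per work-item: out[row] = dot(A[row, :], B).
__kernel void gemm_reference(__global const float* A,
                             __global const float* B,
                             __global float* out,
                             int M, int K)
{
    int row = get_global_id(0);
    float acc = 0.0f;
    for (int k = 0; k < K; ++k)
        acc += A[row * K + k] * B[k];
    out[row] = acc;
}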

View File

@ -0,0 +1,217 @@
#define VSI_FLOAT32_MIN (1.175494351e-38F)
#define MAXPOOL_QINT(in_name, out_name, src_type, dst_type, max_val, read_func, write_func, conv_func) \
__kernel void maxpool_##in_name##to##out_name( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int width, \
int height, \
int stride_x, \
int stride_y, \
int pad_x, \
int pad_y, \
int kernel_dia_x, \
int kernel_dia_y, \
int dilation_x, \
int dilation_y, \
float inout_scale, \
float inout_tail) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
int gidz = get_global_id(2); \
int4 coord_out = (int4)(gidx, gidy, gidz, 0); \
int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y); \
int4 coord_in = coord_out; \
int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y); \
\
for(; pos_start.x < 0;) \
{ \
pos_start.x += dilation_x; \
} \
for(; pos_start.y < 0;) \
{ \
pos_start.y += dilation_y; \
} \
\
pos_end = min(pos_end, (int2)(width, height)); \
\
src_type src0, maxVal; \
maxVal.x = max_val; \
\
for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y) \
{ \
for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;) \
{ \
src0 = read_func(input, coord_in); \
coord_in.x += dilation_x; \
maxVal = max(src0, maxVal); \
} \
} \
\
float4 fValTmp; \
fValTmp.x = maxVal.x * inout_scale + inout_tail; \
dst_type dst = conv_func(fValTmp); \
write_func(output, coord_out, dst.xxxx); \
}
MAXPOOL_QINT(U32, U32, uint4, uint4, 0, read_imageui, write_imageui, convert_uint4_rte)
MAXPOOL_QINT(I32, I32, int4, int4, -2147483648, read_imagei, write_imagei, convert_int4_rte)
__kernel void maxpool_F32toF32(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int width,
int height,
int stride_x,
int stride_y,
int pad_x,
int pad_y,
int kernel_dia_x,
int kernel_dia_y,
int dilation_x,
int dilation_y,
float inout_scale,
float inout_tail)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int gidz = get_global_id(2);
int4 coord_out = (int4)(gidx, gidy, gidz, 0);
int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y);
int4 coord_in = coord_out;
int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y);
for(; pos_start.x < 0;)
{
pos_start.x += dilation_x;
}
for(; pos_start.y < 0;)
{
pos_start.y += dilation_y;
}
pos_end = min(pos_end, (int2)(width, height));
float4 src0, maxVal;
maxVal.x = VSI_FLOAT32_MIN;
for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y)
{
for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;)
{
src0 = read_imagef(input, coord_in);
coord_in.x += dilation_x;
maxVal = max(src0, maxVal);
}
}
write_imagef(output, coord_out, maxVal.xxxx);
}
__kernel void maxpool_U32toF32(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int width,
int height,
int stride_x,
int stride_y,
int pad_x,
int pad_y,
int kernel_dia_x,
int kernel_dia_y,
int dilation_x,
int dilation_y,
float inout_scale,
float inout_tail)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int gidz = get_global_id(2);
int4 coord_out = (int4)(gidx, gidy, gidz, 0);
int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y);
int4 coord_in = coord_out;
int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y);
for(; pos_start.x < 0;)
{
pos_start.x += dilation_x;
}
for(; pos_start.y < 0;)
{
pos_start.y += dilation_y;
}
pos_end = min(pos_end, (int2)(width, height));
uint4 src0, maxVal;
maxVal.x = 0;
for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y)
{
for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;)
{
src0 = read_imageui(input, coord_in);
coord_in.x += dilation_x;
maxVal = max(src0, maxVal);
}
}
float4 dst;
dst.x = maxVal.x * inout_scale + inout_tail;
write_imagef(output, coord_out, dst.xxxx);
}
__kernel void maxpool_F32toU32(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int width,
int height,
int stride_x,
int stride_y,
int pad_x,
int pad_y,
int kernel_dia_x,
int kernel_dia_y,
int dilation_x,
int dilation_y,
float inout_scale,
float inout_tail)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int gidz = get_global_id(2);
int4 coord_out = (int4)(gidx, gidy, gidz, 0);
int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y);
int4 coord_in = coord_out;
int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y);
for(; pos_start.x < 0;)
{
pos_start.x += dilation_x;
}
for(; pos_start.y < 0;)
{
pos_start.y += dilation_y;
}
pos_end = min(pos_end, (int2)(width, height));
float4 src0, maxVal;
maxVal.x = VSI_FLOAT32_MIN;
for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y)
{
for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;)
{
src0 = read_imagef(input, coord_in);
coord_in.x += dilation_x;
maxVal = max(src0, maxVal);
}
}
uint4 dst;
dst.x = convert_uint_rte(maxVal.x * inout_scale + inout_tail);
write_imageui(output, coord_out, dst.xxxx);
}
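
In the kernels above, kernel_dia_x/y are dilated kernel extents rather than tap counts: the pos_start loops re-align the window to the first in-bounds tap on the dilation grid, and pos_end = pos_start + kernel_dia clips it to the image. A sketch of the extent the host is presumably expected to pass:

// A k-tap kernel with dilation d has taps d pixels apart, so its window
// spans (k - 1) * d + 1 pixels.
int dilated_kernel_extent(int ksize, int dilation)
{
    return (ksize - 1) * dilation + 1;
}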

View File

@ -232,3 +232,66 @@ __kernel void moments_axis01_BF16toF32(
write_imagef(output_vari, coord_out, vari);
}
}
__kernel __attribute__((reqd_work_group_size(8, 8, 1))) void moments_axis12_U8toF32(
image2d_array_t input, image2d_array_t output_mean, image2d_array_t output_vari,
int axis, int axis_num, int input_zp, float input_scale,
int width, int height, int chn, float dimRatio
)
{
int lidx = get_local_id(0);
int lidy = get_local_id(1);
int gidz = get_global_id(2); // width
int4 coord = (int4)(gidz, lidx, lidy, 0);
uint4 data;
float sum = 0, sqr = 0;
float e2InScale = input_scale * input_scale;
__local uint lcl_sumSqr[128];
__local uint lcl_sumSqr1[32];
uint2 tmpSumSqr = 0;
for(coord.z = lidy; coord.z < chn; coord.z += 8)
{
for(coord.y = lidx; coord.y < height;)
{
data = read_imageui(input, coord);
coord.y += 8;
tmpSumSqr = tmpSumSqr + (uint2)(data.x, data.x * data.x);
}
//sqr += (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale;
//sum += (tmpSum - height * input_zp) * input_scale;
}
int index = lidx + lidy * 8;
vstore2(tmpSumSqr, index, lcl_sumSqr);
barrier(CLK_LOCAL_MEM_FENCE);
if(index < 16)
{
uint4 val0 = vload4(index, lcl_sumSqr);
uint4 val1 = vload4(index, lcl_sumSqr + 64);
val0 += val1;
uint2 val2 = val0.xy + val0.zw;
vstore2(val2, index, lcl_sumSqr1);
}
barrier(CLK_LOCAL_MEM_FENCE);
if(index == 0)
{
uint4 val0 = 0;
for(int i = 0; i < 8; i++)
{
val0 += vload4(i, lcl_sumSqr1);
}
float2 tmpVal = convert_float2(val0.xy + val0.zw);
sum = (tmpVal.x - height * chn * input_zp) * input_scale;
sqr = (tmpVal.y - 2 * input_zp * tmpVal.x + height * chn * input_zp * input_zp) * e2InScale;
float4 mean, vari;
mean.x = sum * dimRatio;
vari.x = sqr * dimRatio;
vari.x = vari.x - mean.x * mean.x;
write_imagef(output_mean, coord.xwww, mean);
write_imagef(output_vari, coord.xwww, vari);
}
}
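
The reduction above accumulates raw sum(q) and sum(q^2) through two rounds of local memory (64 work-item partials -> 16 -> 1) and dequantizes only once at the end, using the identities sum(x) = (sum(q) - N*zp) * s and sum(x^2) = (sum(q^2) - 2*zp*sum(q) + N*zp^2) * s^2 for x = (q - zp) * s, then variance = E[x^2] - E[x]^2. The finalization in isolation, with N = height * chn as in the kernel:

// Dequantized mean/variance from raw integer accumulators.
float2 finalize_moments(float sumQ, float sumQ2, float N, float zp, float s)
{
    float mean = (sumQ - N * zp) * s / N;
    float ex2  = (sumQ2 - 2.0f * zp * sumQ + N * zp * zp) * s * s / N;
    return (float2)(mean, ex2 - mean * mean); // (mean, variance)
}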

View File

@ -5,157 +5,6 @@
_viv_uniform float4 matrix0;
_viv_uniform float2 matrix1;
_viv_uniform float4 matrix4;
__kernel void custom_warp_affine_nearest_neighbor_U8toU8_2D
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
float _m0,
float _m1,
float _m2,
float _m3,
float _m4,
float _m5
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
float4 coord_f = convert_float4(coord_in);
coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;
coord_in = convert_int4(coord_f);
vxc_uchar16 dst;
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
__kernel void custom_warp_affine_bilinear_U8toU8_2D
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
float _m0,
float _m1,
float _m2,
float _m3,
float _m4,
float _m5
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
float4 coord_f = convert_float4(coord_in);
coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;
coord_in = convert_int4(coord_f);
vxc_uchar16 src0, src1, dst;
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#endif
VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#endif
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#endif
VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#endif
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#endif
VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#endif
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
#endif
VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
#endif
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
__kernel void custom_warp_affine_nearest_neighbor_U8toU8
(

View File

@ -0,0 +1,158 @@
#pragma OPENCL EXTENSION cl_viv_vx_extension : enable
#include "cl_viv_vx_ext.h"
_viv_uniform float4 matrix0;
_viv_uniform float2 matrix1;
_viv_uniform float4 matrix4;
__kernel void custom_warp_affine_nearest_neighbor_U8toU8_2D
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
float _m0,
float _m1,
float _m2,
float _m3,
float _m4,
float _m5
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
float4 coord_f = convert_float4(coord_in);
coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;
coord_in = convert_int4(coord_f);
vxc_uchar16 dst;
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
__kernel void custom_warp_affine_bilinear_U8toU8_2D
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
float _m0,
float _m1,
float _m2,
float _m3,
float _m4,
float _m5
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
float4 coord_f = convert_float4(coord_in);
coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;
coord_in = convert_int4(coord_f);
vxc_uchar16 src0, src1, dst;
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#endif
VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#endif
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#endif
VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#endif
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#endif
VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#endif
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
#endif
VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
#endif
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

View File

@ -0,0 +1,341 @@
#pragma OPENCL EXTENSION cl_viv_vx_extension : enable
#include "cl_viv_vx_ext.h"
#define WARP_AFFINE(name) \
__kernel void custom_warp_affine_##name \
( \
__read_only image2d_array_t input, \
__read_only image2d_t matrix, \
__write_only image2d_array_t output, \
float _m0, \
float _m1, \
float _m2, \
float _m3, \
float _m4, \
float _m5 \
)
#define GET_MATRIX_VALUE \
float4 matrix0; \
float2 matrix1; \
float4 matrix4; \
int2 coord_matrix = (int2)(0,0); \
Image img1 = create_image_from_image2d(matrix, 4); \
__global float* matrix_ptr = (__global float*)img1.ptr; \
matrix0 = vload4(0, matrix_ptr); \
matrix1 = vload2(2, matrix_ptr); \
matrix4.x = matrix0.x; \
matrix4.y = matrix0.y; \
matrix4.z = matrix0.x * 2; \
matrix4.w = matrix0.y * 2;
WARP_AFFINE(nearest_neighbor_U8toU8_2D_optional_input)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
float4 coord_f = convert_float4(coord_in);
GET_MATRIX_VALUE
coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;
coord_in = convert_int4(coord_f);
vxc_uchar16 dst;
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
WARP_AFFINE(bilinear_U8toU8_2D_optional_input)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
float4 coord_f = convert_float4(coord_in);
GET_MATRIX_VALUE
coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;
coord_in = convert_int4(coord_f);
vxc_uchar16 src0, src1, dst;
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#endif
VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#endif
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#endif
VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#endif
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#endif
VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#endif
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
#endif
VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
#endif
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
WARP_AFFINE(nearest_neighbor_U8toU8_optional_input)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
float4 coord_f = convert_float4(coord_in);
GET_MATRIX_VALUE
coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;
coord_in = convert_int4(coord_f);
int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_input.w, baseAddr);
vxc_uchar16 dst;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_input.xy = coord_in.zw;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
coord_input.xy = coord_in.xy;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_input.xy = coord_in.zw;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
coord_input.xy = coord_in.xy;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
coord_input.xy = coord_in.zw;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
coord_input.xy = coord_in.xy;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
coord_input.xy = coord_in.zw;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
WARP_AFFINE(bilinear_U8toU8_optional_input)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
float4 coord_f = convert_float4(coord_in);
GET_MATRIX_VALUE
coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;
coord_in = convert_int4(coord_f);
int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_input.w, baseAddr);
vxc_uchar16 src0, src1, dst;
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#endif
coord_input.xy = coord_in.zw;
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#endif
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
coord_input.xy = coord_in.xy;
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#endif
coord_input.xy = coord_in.zw;
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#endif
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
coord_input.xy = coord_in.xy;
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#endif
coord_input.xy = coord_in.zw;
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#endif
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
coord_input.xy = coord_in.xy;
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
#endif
coord_input.xy = coord_in.zw;
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
#endif
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
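
GET_MATRIX_VALUE loads the six affine coefficients from the optional matrix image, so every kernel in this file applies the usual OpenVX warp-affine mapping src_x = m0*x + m2*y + m4, src_y = m1*x + m3*y + m5 to destination coordinates; matrix4 = (m0, m1, 2*m0, 2*m1) then steps a packed pair of source coordinates forward two destination columns at a time, matching the two-pixels-per-iteration unrolling. The mapping as a plain function:

// m0123 = (m0, m1, m2, m3) from vload4, m45 = (m4, m5) from vload2.
float2 warp_affine_src_coord(float2 dst, float4 m0123, float2 m45)
{
    return (float2)(m0123.x * dst.x + m0123.z * dst.y + m45.x,
                    m0123.y * dst.x + m0123.w * dst.y + m45.y);
}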

View File

@ -0,0 +1,333 @@
#pragma OPENCL EXTENSION cl_viv_vx_extension : enable
#include "cl_viv_vx_ext.h"
#define GET_MATRIX_VALUE \
float4 matrix0; \
float2 matrix1; \
Image img1 = create_image_from_image2d(matrix, 4); \
__global float* matrix_ptr = (__global float*)img1.ptr; \
matrix0 = vload4(0, matrix_ptr); \
matrix1 = vload2(2, matrix_ptr);
__kernel void custom_warp_affine_nearest_neighbor_U8toU8_rgb_2D_optional_input
(
__read_only image2d_array_t input,
__read_only image2d_t matrix,
__write_only image2d_array_t output,
float _m0,
float _m1,
float _m2,
float _m3,
float _m4,
float _m5
)
{
int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
float4 coord_f = convert_float4(coord_in);
int2 coord_matrix = (int2)(0,0);
GET_MATRIX_VALUE
coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;
coord_in.x = floor(coord_f.x) * 3;
coord_in.y = floor(coord_f.y);
coord_in.z = floor(coord_f.z) * 3;
coord_in.w = floor(coord_f.w);
vxc_uchar16 dst;
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = coord_in.x + 1;
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = coord_in.x + 1;
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_in.z = coord_in.z + 1;
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
coord_in.z = coord_in.z + 1;
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
}
__kernel void custom_warp_affine_bilinear_U8toU8_rgb_2D_optional_input
(
__read_only image2d_array_t input,
__read_only image2d_t matrix,
__write_only image2d_array_t output,
float _m0,
float _m1,
float _m2,
float _m3,
float _m4,
float _m5
)
{
int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
float4 coord_f = convert_float4(coord_in);
int2 coord_matrix = (int2)(0,0);
GET_MATRIX_VALUE
coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;
coord_in.x = floor(coord_f.x) * 3;
coord_in.y = floor(coord_f.y);
coord_in.z = floor(coord_f.z) * 3;
coord_in.w = floor(coord_f.w);
vxc_uchar16 src0, src1, src_0, src_1, dst;
VXC_ReadImage(src_0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src_1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
src0.x = src_0.s0;
src0.y = src_0.s3;
src1.x = src_1.s0;
src1.y = src_1.s3;
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#endif
src0.x = src_0.s1;
src0.y = src_0.s4;
src1.x = src_1.s1;
src1.y = src_1.s4;
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#endif
src0.x = src_0.s2;
src0.y = src_0.s5;
src1.x = src_1.s2;
src1.y = src_1.s5;
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#endif
VXC_ReadImage(src_0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src_1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
src0.x = src_0.s0;
src0.y = src_0.s3;
src1.x = src_1.s0;
src1.y = src_1.s3;
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#endif
src0.x = src_0.s1;
src0.y = src_0.s4;
src1.x = src_1.s1;
src1.y = src_1.s4;
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#endif
src0.x = src_0.s2;
src0.y = src_0.s5;
src1.x = src_1.s2;
src1.y = src_1.s5;
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#endif
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
}
__kernel void custom_warp_affine_nearest_neighbor_U8toU8_rgb_optional_input
(
__read_only image2d_array_t input,
__read_only image2d_t matrix,
__write_only image2d_array_t output,
float _m0,
float _m1,
float _m2,
float _m3,
float _m4,
float _m5
)
{
int4 coord = (int4)(get_global_id(0) * 3, get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
float4 coord_f = convert_float4(coord_in);
int2 coord_matrix = (int2)(0,0);
GET_MATRIX_VALUE
coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;
coord_in.x = floor(coord_f.x) * 3;
coord_in.y = floor(coord_f.y);
coord_in.z = floor(coord_f.z) * 3;
coord_in.w = floor(coord_f.w);
int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_input.w, baseAddr);
vxc_uchar16 dst;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_input.x = coord_input.x + 1;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_input.x = coord_input.x + 1;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_input.xy = coord_in.zw;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_input.x = coord_input.x + 1;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
coord_input.x = coord_input.x + 1;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
}
__kernel void custom_warp_affine_bilinear_U8toU8_rgb_optional_input
(
__read_only image2d_array_t input,
__read_only image2d_t matrix,
__write_only image2d_array_t output,
float _m0,
float _m1,
float _m2,
float _m3,
float _m4,
float _m5
)
{
int4 coord = (int4)(get_global_id(0) * 3, get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
float4 coord_f = convert_float4(coord_in);
int2 coord_matrix = (int2)(0,0);
GET_MATRIX_VALUE
coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;
coord_in.x = floor(coord_f.x) * 3;
coord_in.y = floor(coord_f.y);
coord_in.z = floor(coord_f.z) * 3;
coord_in.w = floor(coord_f.w);
int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_input.w, baseAddr);
vxc_uchar16 src0, src1, src_0, src_1, dst;
VXC_OP4(img_load_3d, src_0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src_1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
src0.x = src_0.s0;
src0.y = src_0.s3;
src1.x = src_1.s0;
src1.y = src_1.s3;
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#endif
src0.x = src_0.s1;
src0.y = src_0.s4;
src1.x = src_1.s1;
src1.y = src_1.s4;
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#endif
src0.x = src_0.s2;
src0.y = src_0.s5;
src1.x = src_1.s2;
src1.y = src_1.s5;
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#endif
coord_input.xy = coord_in.zw;
VXC_OP4(img_load_3d, src_0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src_1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
src0.x = src_0.s0;
src0.y = src_0.s3;
src1.x = src_1.s0;
src1.y = src_1.s3;
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#endif
src0.x = src_0.s1;
src0.y = src_0.s4;
src1.x = src_1.s1;
src1.y = src_1.s4;
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#endif
src0.x = src_0.s2;
src0.y = src_0.s5;
src1.x = src_1.s2;
src1.y = src_1.s5;
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#endif
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
}
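Note: on the non-VX1 path the bilinear sample is built from two VXC_Lerp calls per channel, blending vertically first and then horizontally. Assuming VXC_Lerp weights by the fractional part of the transformed coordinate, this is the standard bilinear formula

    out = lerp(lerp(p00, p01, fy), lerp(p10, p11, fy), fx)

where p00/p10 are the two neighbors in the top row and p01/p11 in the row below, applied independently to the R, G and B bytes that src0/src1 are repacked from.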

View File

@ -1,6 +1,8 @@
#include "cl_viv_vx_ext.h" #include "cl_viv_vx_ext.h"
_viv_uniform int indices_num; _viv_uniform int indices_num;
_viv_uniform int remainder;
_viv_uniform int width;
_viv_uniform VXC_512Bits uniExtraCopyDpKeepinEvis_2x8; _viv_uniform VXC_512Bits uniExtraCopyDpKeepinEvis_2x8;
__kernel void gather_I8toI8_array( __kernel void gather_I8toI8_array(
@ -131,10 +133,12 @@ __kernel void gather_##src0_type_name##to##src0_type_name##_axis0_array( \
int axis_num \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \
\
if (coord.x >= width) return; \
Image img0 = create_image_from_image2d(input0, 1); \
Image img1 = create_image_from_image2d(input1, 4); \
Image img2 = create_image_from_image2d(output, 1); \
uchar* index_ptr = get_image_ptr_from_coord(img1, coord.xz); \
__global int* index = (__global int*)index_ptr; \
int4 indices = vload4(0, index); \
@ -146,10 +150,30 @@ __kernel void gather_##src0_type_name##to##src0_type_name##_axis0_array( \
__global data_type* data_ptr = (__global data_type*)input_ptr; \
__global write_type* out_ptr = (__global write_type*)output_ptr; \
indices = indices >= 0 ? indices : indices + axis_num; \
if (coord.x + remainder < width) \
{ \
src.s0 = data_ptr[indices.x]; \
src.s1 = data_ptr[indices.y]; \
src.s2 = data_ptr[indices.z]; \
src.s3 = data_ptr[indices.w]; \
} \
else \
{ \
__global data_type* out_ptr_remainder = (__global data_type*)output_ptr; \
switch (remainder) \
{ \
case 3: \
out_ptr_remainder[2] = data_ptr[indices.z]; \
case 2: \
out_ptr_remainder[1] = data_ptr[indices.y]; \
case 1: \
out_ptr_remainder[0] = data_ptr[indices.x]; \
break; \
default: \
break; \
} \
return; \
} \
\
VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniExtraCopyDpKeepinEvis_2x8); \
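Note: the new remainder branch exists because the vector copy above always writes 8 lanes, which would run past `width` on the last work-item. A minimal C sketch of the same tail-handling idea (a hypothetical helper, not part of the kernel) — gather 4 elements per step, then fall through a switch for the last width % 4 elements exactly as the kernel does:

/* hypothetical illustration of the kernel's tail handling */
#include <stddef.h>

static void gather_tail(const int *data, const int *indices,
                        int *out, size_t width)
{
    size_t i = 0;
    for (; i + 4 <= width; i += 4) {            /* full 4-wide groups */
        out[i + 0] = data[indices[i + 0]];
        out[i + 1] = data[indices[i + 1]];
        out[i + 2] = data[indices[i + 2]];
        out[i + 3] = data[indices[i + 3]];
    }
    switch (width - i) {                        /* remainder: 0..3 */
    case 3: out[i + 2] = data[indices[i + 2]]; /* fall through */
    case 2: out[i + 1] = data[indices[i + 1]]; /* fall through */
    case 1: out[i + 0] = data[indices[i + 0]]; break;
    default: break;
    }
}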

View File

@ -3,6 +3,9 @@
#define logE (1.44269502f)
#define twoLogE (2.88539004f)
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
float4 sigmoid(float4 x)
{
x *= -logE;
@ -104,3 +107,53 @@ GRUCELL_ACTIVATION_SIGMOID_TANH(F16, F16, F16, U8, hsigmoid,
#undef UCHAR8
#undef SHORT8
#undef HALF8
#define GRUCELL_ACTIVATION_SIGMOID_TANH_BF16(activater) \
__kernel void grucell_activation_BF16_BF16_BF16_to_BF16_##activater \
( \
__read_only image2d_array_t input0, \
__read_only image2d_array_t input1, \
__read_only image2d_array_t input2, \
__write_only image2d_array_t output, \
__write_only image2d_array_t hstate, \
int gate_activation, \
int candidate_activation \
) \
{ \
vxc_short8 src00, src10, src20, data0, data1; \
float4 src01, src11, src21; \
\
int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
VXC_ReadImage(src00, input0, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(src10, input1, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(src20, input2, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
VXC_DP2x8(data0, src00, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src01, data0, 16); \
VXC_DP2x8(data1, src10, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src11, data1, 16); \
VXC_DP2x8(data0, src20, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src21, data0, 16); \
\
src01 = src01 * tensorScale.xxxx - tensorZP.xxxx; \
src01 = activater(src01); \
\
src11 = src11 * tensorScale.yyyy - tensorZP.yyyy; \
src11 = tangentH(src11); \
\
src21 = src21 * tensorScale.zzzz - tensorZP.zzzz; \
\
src11 = src11 - src01 * src11; \
src11 = src01 * src21 + src11; \
\
src11 = src11 * tensorScale.wwww + tensorZP.wwww; \
_viv_asm(COPY, src00, src11, 16); \
VXC_DP2x8(data0, src00, src00, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
VXC_WriteImage(output, coord, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_WriteImage(hstate, coord, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
GRUCELL_ACTIVATION_SIGMOID_TANH_BF16(sigmoid)
GRUCELL_ACTIVATION_SIGMOID_TANH_BF16(hsigmoid)
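Note: after dequantizing with tensorScale/tensorZP, the arithmetic above is the usual GRU blend. Reading src01 as the update gate z, src11 as the candidate and src21 as the previous state (an interpretation, since the arguments are only named input0..2):

    h_out = z \odot h_prev + (1 - z) \odot tanh(candidate)

computed as src11 - z*src11 followed by the fused z*src21 + src11.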

View File

@ -3,6 +3,11 @@
_viv_uniform VXC_512Bits uniA_Minus_B_2x8;
_viv_uniform VXC_512Bits uniA_Times_B_2x8;
_viv_uniform VXC_512Bits uniA_Plus_B_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
__kernel void grucell_activation_sma_F16_F16_F16toF16
(
__read_only image2d_array_t input0,
@ -61,3 +66,101 @@ __kernel void grucell_activation_sma_F16_F16_F16toF16_2D
VXC_WriteImage(h_status, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
__kernel void grucell_activation_sma_BF16_BF16_BF16toBF16
(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__read_only image2d_array_t input2,
__write_only image2d_array_t output,
__write_only image2d_array_t h_status
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
float4 src0, src00, src1, src11, src2, src22, minus, minus1, dst, dst1;
vxc_ushort8 vec0, vec1, vec2, data0, data1;
VXC_ReadImage2DArray(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(vec1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(vec2, input2, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
VXC_DP2x8(data0, vec0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
VXC_DP2x8(data1, vec0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, src0, data0, 16);
_viv_asm(COPY, src00, data1, 16);
VXC_DP2x8(data0, vec1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
VXC_DP2x8(data1, vec1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, src1, data0, 16);
_viv_asm(COPY, src11, data1, 16);
VXC_DP2x8(data0, vec2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
VXC_DP2x8(data1, vec2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, src2, data0, 16);
_viv_asm(COPY, src22, data1, 16);
minus = src0 - src1;
minus1 = src00 - src11;
dst = minus * src2 + src1;
dst1 = minus1 * src22 + src11;
_viv_asm(COPY, vec0, dst, 16);
_viv_asm(COPY, vec1, dst1, 16);
VXC_DP2x8(data0, vec0, vec1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_WriteImage2DArray(output, coord, data0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
VXC_WriteImage(h_status, coord, data0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
__kernel void grucell_activation_sma_BF16_BF16_BF16toBF16_2D
(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__read_only image2d_array_t input2,
__write_only image2d_array_t output,
__write_only image2d_array_t h_status
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
float4 src0, src00, src1, src11, src2, src22, minus, minus1, dst, dst1;
vxc_ushort8 vec0, vec1, vec2, data0, data1;
VXC_ReadImage(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(vec1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(vec2, input2, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
VXC_DP2x8(data0, vec0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
VXC_DP2x8(data1, vec0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, src0, data0, 16);
_viv_asm(COPY, src00, data1, 16);
VXC_DP2x8(data0, vec1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
VXC_DP2x8(data1, vec1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, src1, data0, 16);
_viv_asm(COPY, src11, data1, 16);
VXC_DP2x8(data0, vec2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
VXC_DP2x8(data1, vec2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, src2, data0, 16);
_viv_asm(COPY, src22, data1, 16);
minus = src0 - src1;
minus1 = src00 - src11;
dst = minus * src2 + src1;
dst1 = minus1 * src22 + src11;
_viv_asm(COPY, vec0, dst, 16);
_viv_asm(COPY, vec1, dst1, 16);
VXC_DP2x8(data0, vec0, vec1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_WriteImage(output, coord, data0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
VXC_WriteImage(h_status, coord, data0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
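Note: both BF16 SMA kernels compute the same per-element mix as the F16 versions they sit next to:

    out = (src0 - src1) * src2 + src1 = src1 + src2 \odot (src0 - src1)

i.e. a linear interpolation between input1 and input0 with input2 as the blend weight, done as one subtract plus one fused multiply-add per half, with the Part0/Part1 uniforms unpacking the low and high four bfloat16 lanes.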

View File

@ -3,6 +3,9 @@
#define logE (1.44269502f)
#define twoLogE (logE * 2.0f)
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
float4 sigmoid_func(float4 x)
{
x *= -logE;
@ -128,3 +131,52 @@ GRUCELL_QNT_F16TO_QNT(I16, I16, SIGMOID, sigmoid_func, vxc_short8, vxc_short8)
GRUCELL_QNT_F16TO_QNT(U8, U8, HSIGMOID, hard_sigmoid, vxc_uchar8, vxc_uchar8)
GRUCELL_QNT_F16TO_QNT(I8, I8, HSIGMOID, hard_sigmoid, vxc_char8, vxc_char8)
GRUCELL_QNT_F16TO_QNT(I16, I16, HSIGMOID, hard_sigmoid, vxc_short8, vxc_short8)
#define GRUCELL_BF16(act_name, act_func) \
__kernel void grucell_activation_z_h_BF16_BF16toBF16_##act_name( \
__read_only image2d_t hstate_in, \
__read_only image2d_t input_z_conv, \
__read_only image2d_t input_h_conv, \
__read_only image2d_t hstate_z_conv, \
__read_only image2d_t hstate_h_conv, \
__write_only image2d_t output, \
__write_only image2d_t hstate_out \
) \
{ \
int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \
vxc_short8 v0, v1, v2, v3, v4, v5, v6, data0, data1; \
float4 src0, src1, src2, src3, src4, src5, src6; \
VXC_ReadImage(v2, hstate_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(v4, input_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(v5, input_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(v6, hstate_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(v3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
VXC_DP2x8(data0, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src2, data0, 16); \
VXC_DP2x8(data1, v4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src4, data1, 16); \
VXC_DP2x8(data0, v5, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src5, data0, 16); \
VXC_DP2x8(data1, v6, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src6, data1, 16); \
VXC_DP2x8(data0, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src3, data0, 16); \
\
float4 h = src2 + src4; \
float4 z = src5 + src6; \
h = tanh_func(h); \
z = act_func(z); \
float4 result = (1 - z) * h + z * src3; \
_viv_asm(COPY, v0, result, 16); \
VXC_DP2x8(data0, v0, v0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
VXC_WriteImage(output, coord_in, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_WriteImage(hstate_out, coord_in, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
GRUCELL_BF16(SIGMOID, sigmoid_func)
GRUCELL_BF16(HSIGMOID, hard_sigmoid)
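Note: the z/h kernel fuses the last two GRU steps. With z the update gate and h the candidate:

    h = tanh(input_h_conv + hstate_h_conv)
    z = act(input_z_conv + hstate_z_conv)
    h_out = (1 - z) \odot h + z \odot hstate_in

where act is sigmoid or hard_sigmoid depending on the instantiation.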

View File

@ -0,0 +1,344 @@
#include "cl_viv_vx_ext.h"
#define logE (1.44269502f)
#define twoLogE (2.88539004f)
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
float4 sigmoid(float4 x)
{
x *= -logE;
x = 1 + exp2(x);
return 1 / x;
}
float4 hard_sigmoid(float4 x)
{
x = 0.2 * x + 0.5;
x = clamp(x, 0, 1);
return x;
}
float4 tangentH(float4 x)
{
x *= -twoLogE;
x = 1 + exp2(x);
x = 1 / x;
return 2 * x - 1;
}
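Note: these helpers route sigmoid and tanh through exp2, which the GPU evaluates natively. Since e^x = 2^(x * log2 e), with logE = log2(e) ≈ 1.44269502 and twoLogE = 2 * log2(e):

    sigmoid(x) = 1 / (1 + e^{-x})      = 1 / (1 + 2^{-x * logE})
    tanh(x)    = 2 / (1 + e^{-2x}) - 1 = 2 / (1 + 2^{-x * twoLogE}) - 1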
__kernel void grucell_activation_cdnn_sep_BF16_BF16_BF16_to_BF16_NC(
__read_only image2d_array_t prev_state,
__read_only image2d_array_t input_r,
__read_only image2d_array_t input_z,
__read_only image2d_array_t input_c,
__read_only image2d_array_t recur_r,
__read_only image2d_array_t recur_z,
__read_only image2d_array_t recur_c,
__read_only image2d_t bias_r,
__read_only image2d_t bias_z,
__read_only image2d_t bias_c,
__read_only image2d_t cond_r,
__read_only image2d_t cond_z,
__read_only image2d_t cond_c,
__write_only image2d_array_t output,
__write_only image2d_array_t hstate,
int gate_activation, int candidate_activation, int batch_first)
{
vxc_ushort8 s0, s1, s2, s3, s4, s5, s7, data0, data1;
float4 r0, r1, z0, z1, c0, c1, state;
float4 r, r2, r3, z, z2, z3, c, c2, c3;
int2 coord = (int2)(get_global_id(0), get_global_id(1));
VXC_ReadImage(s0, input_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(s1, recur_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(s2, input_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(s3, recur_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(s4, input_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(s5, recur_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(s7, prev_state, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
r2 = read_imagef(bias_r, coord);
r3 = read_imagef(cond_r, coord);
z2 = read_imagef(bias_z, coord);
z3 = read_imagef(cond_z, coord);
c2 = read_imagef(bias_c, coord);
c3 = read_imagef(cond_c, coord);
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
VXC_DP2x8(data0, s0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, r0, data0, 16);
VXC_DP2x8(data1, s1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, r1, data1, 16);
VXC_DP2x8(data0, s2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, z0, data0, 16);
VXC_DP2x8(data1, s3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, z1, data1, 16);
VXC_DP2x8(data0, s4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, c0, data0, 16);
VXC_DP2x8(data1, s5, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, c1, data1, 16);
VXC_DP2x8(data0, s7, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, state, data0, 16);
r = r0 + r1 + r2 + r3;
z = z0 + z1 + z2 + z3;
r = sigmoid(r);
z = sigmoid(z);
c = c2 * r + c3;
c = c0 + c1 * r + c;
c = tangentH(c);
state = z * (state - c) + c;
_viv_asm(COPY, s0, state, 16);
VXC_DP2x8(data0, s0, s0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_WriteImage(output, coord, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(hstate, coord, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}
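Note: modulo the BF16 unpack/repack, the NC kernel is a cuDNN-style GRU cell. Reading bias_* as the input-side bias and cond_* as the recurrent-side bias (an interpretation of the argument names):

    r  = sigmoid(x_r + h_r + b_r + c_r)
    z  = sigmoid(x_z + h_z + b_z + c_z)
    c~ = tanh(x_c + r \odot (h_c + b_c) + c_c)
    h' = z \odot h + (1 - z) \odot c~

The candidate line matches c = c0 + c1*r + (c2*r + c3) above: the reset gate scales the recurrent contribution and its bias before the tanh.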
__kernel void grucell_activation_cdnn_sep_BF16_BF16_BF16_to_BF16_CN(
__read_only image2d_array_t prev_state,
__read_only image2d_array_t input_r,
__read_only image2d_array_t input_z,
__read_only image2d_array_t input_c,
__read_only image2d_array_t recur_r,
__read_only image2d_array_t recur_z,
__read_only image2d_array_t recur_c,
__read_only image2d_t bias_r,
__read_only image2d_t bias_z,
__read_only image2d_t bias_c,
__read_only image2d_t cond_r,
__read_only image2d_t cond_z,
__read_only image2d_t cond_c,
__write_only image2d_array_t output,
__write_only image2d_array_t hstate,
int gate_activation, int candidate_activation, int batch_first)
{
vxc_ushort8 s0, s1, s2, s3, s4, s5, s7, data0, data1;
float4 r0, r1, z0, z1, c0, c1, state;
float4 r, r2, r3, z, z2, z3, c, c2, c3;
int2 coord = (int2)(get_global_id(0), get_global_id(1));
VXC_ReadImage(s0, input_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(s1, recur_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(s2, input_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(s3, recur_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(s4, input_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(s5, recur_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(s7, prev_state, coord.yx, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(s7, prev_state, coord.yx, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(s7, prev_state, coord.yx, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(s7, prev_state, coord.yx, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
r2 = read_imagef(bias_r, coord.yx);
r3 = read_imagef(cond_r, coord.yx);
z2 = read_imagef(bias_z, coord.yx);
z3 = read_imagef(cond_z, coord.yx);
c2 = read_imagef(bias_c, coord.yx);
c3 = read_imagef(cond_c, coord.yx);
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
VXC_DP2x8(data0, s0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, r0, data0, 16);
VXC_DP2x8(data1, s1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, r1, data1, 16);
VXC_DP2x8(data0, s2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, z0, data0, 16);
VXC_DP2x8(data1, s3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, z1, data1, 16);
VXC_DP2x8(data0, s4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, c0, data0, 16);
VXC_DP2x8(data1, s5, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, c1, data1, 16);
VXC_DP2x8(data0, s7, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, state, data0, 16);
r = r0 + r1 + r2.xxxx + r3.xxxx;
z = z0 + z1 + z2.xxxx + z3.xxxx;
r = sigmoid(r);
z = sigmoid(z);
c = c2.xxxx * r + c3.xxxx;
c = c0 + c1 * r + c;
c = tangentH(c);
state = z * (state - c) + c;
_viv_asm(COPY, s0, state, 16);
VXC_DP2x8(data0, s0, s0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_WriteImage(output, coord.yx, data0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(hstate, coord.yx, data0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord.x ++;
VXC_WriteImage(output, coord.yx, data0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(hstate, coord.yx, data0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord.x ++;
VXC_WriteImage(output, coord.yx, data0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(hstate, coord.yx, data0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord.x ++;
VXC_WriteImage(output, coord.yx, data0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(hstate, coord.yx, data0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
}
__kernel void grucell_activation_cdnn_sep_BF16_BF16_BF16_to_BF16_CN_FULL(
__read_only image2d_array_t prev_state,
__read_only image2d_array_t input_r,
__read_only image2d_array_t input_z,
__read_only image2d_array_t input_c,
__read_only image2d_array_t recur_r,
__read_only image2d_array_t recur_z,
__read_only image2d_array_t recur_c,
__read_only image2d_t bias_r,
__read_only image2d_t bias_z,
__read_only image2d_t bias_c,
__read_only image2d_t cond_r,
__read_only image2d_t cond_z,
__read_only image2d_t cond_c,
__write_only image2d_array_t output,
__write_only image2d_array_t hstate,
int gate_activation, int candidate_activation, int batch_first)
{
vxc_ushort8 s0, s1, s2, s3, s4, s5, s7, data0, data1;
float4 r0, r1, z0, z1, c0, c1, state;
float4 r, r2, r3, z, z2, z3, c, c2, c3;
int2 coord = (int2)(get_global_id(0), get_global_id(1));
VXC_ReadImage(s0, input_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(s1, recur_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(s2, input_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(s3, recur_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(s4, input_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(s5, recur_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(s7, prev_state, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
r2 = read_imagef(bias_r, coord.yx);
r3 = read_imagef(cond_r, coord.yx);
z2 = read_imagef(bias_z, coord.yx);
z3 = read_imagef(cond_z, coord.yx);
c2 = read_imagef(bias_c, coord.yx);
c3 = read_imagef(cond_c, coord.yx);
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
VXC_DP2x8(data0, s0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, r0, data0, 16);
VXC_DP2x8(data1, s1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, r1, data1, 16);
VXC_DP2x8(data0, s2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, z0, data0, 16);
VXC_DP2x8(data1, s3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, z1, data1, 16);
VXC_DP2x8(data0, s4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, c0, data0, 16);
VXC_DP2x8(data1, s5, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, c1, data1, 16);
VXC_DP2x8(data0, s7, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, state, data0, 16);
r = r0 + r1 + r2.xxxx + r3.xxxx;
z = z0 + z1 + z2.xxxx + z3.xxxx;
r = sigmoid(r);
z = sigmoid(z);
c = c2.xxxx * r + c3.xxxx;
c = c0 + c1 * r + c;
c = tangentH(c);
state = z * (state - c) + c;
_viv_asm(COPY, s0, state, 16);
VXC_DP2x8(data0, s0, s0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_WriteImage(output, coord, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(hstate, coord, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}
__kernel void grucell_activation_cdnn_BF16_BF16_BF16_to_BF16(
__read_only image2d_array_t prev_state,
__read_only image2d_array_t input_rzc,
__read_only image2d_array_t recur_rzc,
__read_only image2d_t bias_r,
__read_only image2d_t bias_z,
__read_only image2d_t bias_c,
__read_only image2d_t cond_r,
__read_only image2d_t cond_z,
__read_only image2d_t cond_c,
__write_only image2d_array_t output,
__write_only image2d_array_t hstate,
int gate_activation, int candidate_activation, int batch_first)
{
vxc_ushort8 s0, s1, s2, s3, s4, s5, s7, data0, data1;
float4 r0, r1, z0, z1, c0, c1, state;
float4 r, r2, r3, z, z2, z3, c, c2, c3;
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1) * 3, get_global_id(1));
VXC_ReadImage(s0, input_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(s1, recur_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(s2, input_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(s3, recur_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(s4, input_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(s5, recur_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(s7, prev_state, coord.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
r2 = read_imagef(bias_r, coord.xy);
r3 = read_imagef(cond_r, coord.xy);
z2 = read_imagef(bias_z, coord.xy);
z3 = read_imagef(cond_z, coord.xy);
c2 = read_imagef(bias_c, coord.xy);
c3 = read_imagef(cond_c, coord.xy);
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
VXC_DP2x8(data0, s0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, r0, data0, 16);
VXC_DP2x8(data1, s1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, r1, data1, 16);
VXC_DP2x8(data0, s2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, z0, data0, 16);
VXC_DP2x8(data1, s3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, z1, data1, 16);
VXC_DP2x8(data0, s4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, c0, data0, 16);
VXC_DP2x8(data1, s5, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, c1, data1, 16);
VXC_DP2x8(data0, s7, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, state, data0, 16);
r = r0 + r1 + r2 + r3;
z = z0 + z1 + z2 + z3;
r = sigmoid(r);
z = sigmoid(z);
c = c2 * r + c3;
c = c0 + c1 * r + c;
c = tangentH(c);
state = z * (state - c) + c;
_viv_asm(COPY, s0, state, 16);
VXC_DP2x8(data0, s0, s0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_WriteImage(output, coord.xy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(hstate, coord.xy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}

View File

@ -3,6 +3,9 @@
#define logE (1.44269502f)
#define twoLogE (logE * 2.0f)
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
float4 sigmoid_func(float4 x)
{
x *= -logE;
@ -98,3 +101,39 @@ GRUCELL_QNT_F16TO_F16(I16, SIGMOID, sigmoid_func, vxc_short8)
GRUCELL_QNT_F16TO_F16(U8, HSIGMOID, hard_sigmoid, vxc_uchar8)
GRUCELL_QNT_F16TO_F16(I8, HSIGMOID, hard_sigmoid, vxc_char8)
GRUCELL_QNT_F16TO_F16(I16, HSIGMOID, hard_sigmoid, vxc_short8)
#define GRUCELL_BF16(act_name, act_func) \
__kernel void grucell_h_times_activation_r_BF16_BF16toBF16_##act_name( \
__read_only image2d_t hstate_in, \
__read_only image2d_t input_r_conv, \
__read_only image2d_t hstate_r_conv, \
__write_only image2d_t output \
) \
{ \
int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \
vxc_short8 v0, v1, v2, v3, data0, data1; \
float4 src0, src1, src2, src3; \
VXC_ReadImage(v0, input_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(v1, hstate_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(v3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
VXC_DP2x8(data0, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src0, data0, 16); \
VXC_DP2x8(data1, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src1, data1, 16); \
VXC_DP2x8(data0, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src3, data0, 16); \
\
float4 r; \
r = src0 + src1; \
r = act_func(r); \
float4 result = r * src3; \
_viv_asm(COPY, v0, result, 16); \
VXC_DP2x8(data0, v0, v0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
VXC_WriteImage(output, coord_in, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
GRUCELL_BF16(SIGMOID, sigmoid_func)
GRUCELL_BF16(HSIGMOID, hard_sigmoid)
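Note: this kernel computes only the reset-gated state term of the GRU:

    out = act(input_r_conv + hstate_r_conv) \odot hstate_in

i.e. r \odot h_prev, which a following kernel can then feed into the candidate computation.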

View File

@ -3,6 +3,9 @@
#define logE (1.44269502f) #define logE (1.44269502f)
#define twoLogE (logE * 2.0f) #define twoLogE (logE * 2.0f)
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
float4 sigmoid_func(float4 x)
{
x *= -logE;
@ -150,3 +153,65 @@ GRUCELL_QNT_F16TO_QNT(I16_F16toI16_TANH_SIGMOID, tanh_func, sigmoid_func,
GRUCELL_QNT_F16TO_QNT(U8_F16toU8_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_uchar8, vxc_uchar8)
GRUCELL_QNT_F16TO_QNT(I8_F16toI8_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_char8, vxc_char8)
GRUCELL_QNT_F16TO_QNT(I16_F16toI16_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_short8, vxc_short8)
#define GRUCELL_BF16(act_name, act_func, rec_act_name, rec_act_func) \
__kernel void grucell_reset_after_activation_BF16_BF16toBF16_##act_name##_##rec_act_name( \
__read_only image2d_t hstate_in, \
__read_only image2d_t input_z_conv, \
__read_only image2d_t input_r_conv, \
__read_only image2d_t input_h_conv, \
__read_only image2d_t hstate_z_conv, \
__read_only image2d_t hstate_r_conv, \
__read_only image2d_t hstate_h_conv, \
__write_only image2d_t output, \
__write_only image2d_t hstate_out \
) \
{ \
int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \
vxc_short8 v0, v1, v2, v3, v4, v5, v6, data0, data1; \
float4 src0, src1, src2, src3, src4, src5, src6; \
VXC_ReadImage(v0, input_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(v1, hstate_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(v2, hstate_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(v4, input_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(v5, input_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(v6, hstate_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(v3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
VXC_DP2x8(data0, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src0, data0, 16); \
VXC_DP2x8(data1, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src1, data1, 16); \
VXC_DP2x8(data0, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src2, data0, 16); \
VXC_DP2x8(data1, v4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src4, data1, 16); \
VXC_DP2x8(data0, v5, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src5, data0, 16); \
VXC_DP2x8(data1, v6, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src6, data1, 16); \
VXC_DP2x8(data0, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src3, data0, 16); \
\
float4 r; \
r = src0 + src1; \
r = rec_act_func(r); \
float4 h = src4 + r * src2; \
float4 z = src5 + src6; \
h = act_func(h); \
z = rec_act_func(z); \
float4 result = (1 - z) * h + z * src3; \
_viv_asm(COPY, v0, result, 16); \
VXC_DP2x8(data0, v0, v0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
VXC_WriteImage(output, coord_in, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_WriteImage(hstate_out, coord_in, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
GRUCELL_BF16(TANH, tanh_func, SIGMOID, sigmoid_func)
GRUCELL_BF16(SIGMOID, sigmoid_func, SIGMOID, sigmoid_func)
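Note: the reset-after variant applies the reset gate to the already-projected recurrent state rather than to the state itself:

    r     = rec_act(x_r + h_r)
    h     = act(x_h + r \odot h_h)
    z     = rec_act(x_z + h_z)
    h_out = (1 - z) \odot h + z \odot h_prev

matching the float4 arithmetic between the unpack and repack steps above.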

View File

@ -0,0 +1,124 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
_viv_uniform float logE;
_viv_uniform float twoLogE;
_viv_uniform float forget_bias;
float4 sigmoid(float4 x)
{
x *= -logE;
x = 1 + exp2(x);
return 1 / x;
}
float4 hard_sigmoid(float4 x)
{
x = 0.2 * x + 0.5;
x = clamp(x, 0, 1);
return x;
}
float4 tangentH(float4 x)
{
x *= -twoLogE;
x = 1 + exp2(x);
x = 1 / x;
return 2 * x - 1;
}
_viv_uniform float outputScale;
_viv_uniform float outputZP;
_viv_uniform float4 clip_Min_F;
_viv_uniform float4 clip_Max_F;
#define LSTMUNIT_BP_BF16(act_name, act_func) \
__kernel void lstmunit_activation_BP_BF16toBF16_BF16_##act_name( \
__read_only image2d_array_t input_i_conv, \
__read_only image2d_array_t input_f_conv, \
__read_only image2d_array_t input_c_conv, \
__read_only image2d_array_t input_o_conv, \
__read_only image2d_t cell_state_in, \
__read_only image2d_array_t hstate_i_conv, \
__read_only image2d_array_t hstate_f_conv, \
__read_only image2d_array_t hstate_c_conv, \
__read_only image2d_array_t hstate_o_conv, \
__read_only image2d_t bias_i, \
__read_only image2d_t bias_f, \
__read_only image2d_t bias_c, \
__read_only image2d_t bias_o, \
__write_only image2d_array_t output, \
__write_only image2d_t cell_state_out, \
int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \
) \
{ \
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \
float4 src0, src1, src2, src3, src4; \
vxc_short8 vect10, vect11, vect12, vect13; \
float4 src10, src11, src12, src13; \
float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \
float4 b0, b1, b2, b3; \
VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
b0 = read_imagef(bias_i, coord_in.xw); \
b1 = read_imagef(bias_f, coord_in.xw); \
b2 = read_imagef(bias_c, coord_in.xw); \
b3 = read_imagef(bias_o, coord_in.xw); \
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
VXC_DP2x8(data0, vect0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src0, data0, 16); \
VXC_DP2x8(data1, vect10, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src10, data1, 16); \
VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src1, data0, 16); \
VXC_DP2x8(data1, vect11, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src11, data1, 16); \
VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src2, data0, 16); \
VXC_DP2x8(data1, vect12, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src12, data1, 16); \
VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src3, data0, 16); \
VXC_DP2x8(data1, vect13, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src13, data1, 16); \
VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, data_c_t, data0, 16); \
data_i_t = src0 + src10 + b0; \
data_f_t = src1 + src11 + b1; \
data_g_t = src2 + src12 + b2; \
data_o_t = src3 + src13 + b3; \
\
data_i_t = act_func(data_i_t); \
data_f_t = act_func(data_f_t + forget_bias); \
data_g_t = tangentH(data_g_t); \
data_i_t = data_i_t * data_g_t; \
data_c_t = data_c_t * data_f_t + data_i_t; \
data_o_t = act_func(data_o_t); \
data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \
data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \
_viv_asm(COPY, vect0, data_c_t, 16); \
VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
data_c_t = tangentH(data_c_t); \
data_o_t = data_o_t * data_c_t * outputScale + outputZP; \
_viv_asm(COPY, vect1, data_o_t, 16); \
VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
LSTMUNIT_BP_BF16(SIGMOID, sigmoid)
LSTMUNIT_BP_BF16(HARD_SIGMOID, hard_sigmoid)
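Note: stripped of the BF16 conversions, the BP kernel is a plain LSTM step with cell clipping and an extra forget bias:

    i   = act(x_i + h_i + b_i)
    f   = act(x_f + h_f + b_f + forget_bias)
    g   = tanh(x_c + h_c + b_c)
    c'  = clamp(f \odot c + i \odot g, clip_Min_F, clip_Max_F)
    o   = act(x_o + h_o + b_o)
    out = (o \odot tanh(c')) * outputScale + outputZP

where act is sigmoid or hard_sigmoid per instantiation; the B variant in the next file additionally mirrors out into h_state_out.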

View File

@ -0,0 +1,126 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
_viv_uniform float logE;
_viv_uniform float twoLogE;
_viv_uniform float forget_bias;
float4 sigmoid(float4 x)
{
x *= -logE;
x = 1 + exp2(x);
return 1 / x;
}
float4 hard_sigmoid(float4 x)
{
x = 0.2 * x + 0.5;
x = clamp(x, 0, 1);
return x;
}
float4 tangentH(float4 x)
{
x *= -twoLogE;
x = 1 + exp2(x);
x = 1 / x;
return 2 * x - 1;
}
_viv_uniform float outputScale;
_viv_uniform float outputZP;
_viv_uniform float4 clip_Min_F;
_viv_uniform float4 clip_Max_F;
#define LSTMUNIT_B_BF16(act_name, act_func) \
__kernel void lstmunit_activation_B_BF16toBF16_BF16_##act_name( \
__read_only image2d_array_t input_i_conv, \
__read_only image2d_array_t input_f_conv, \
__read_only image2d_array_t input_c_conv, \
__read_only image2d_array_t input_o_conv, \
__read_only image2d_t cell_state_in, \
__read_only image2d_array_t hstate_i_conv, \
__read_only image2d_array_t hstate_f_conv, \
__read_only image2d_array_t hstate_c_conv, \
__read_only image2d_array_t hstate_o_conv, \
__read_only image2d_t bias_i, \
__read_only image2d_t bias_f, \
__read_only image2d_t bias_c, \
__read_only image2d_t bias_o, \
__write_only image2d_array_t output, \
__write_only image2d_t cell_state_out, \
__write_only image2d_t h_state_out, \
int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \
) \
{ \
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \
float4 src0, src1, src2, src3, src4; \
vxc_short8 vect10, vect11, vect12, vect13; \
float4 src10, src11, src12, src13; \
float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \
float4 b0, b1, b2, b3; \
VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
b0 = read_imagef(bias_i, coord_in.xw); \
b1 = read_imagef(bias_f, coord_in.xw); \
b2 = read_imagef(bias_c, coord_in.xw); \
b3 = read_imagef(bias_o, coord_in.xw); \
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
VXC_DP2x8(data0, vect0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src0, data0, 16); \
VXC_DP2x8(data1, vect10, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src10, data1, 16); \
VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src1, data0, 16); \
VXC_DP2x8(data1, vect11, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src11, data1, 16); \
VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src2, data0, 16); \
VXC_DP2x8(data1, vect12, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src12, data1, 16); \
VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src3, data0, 16); \
VXC_DP2x8(data1, vect13, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src13, data1, 16); \
VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, data_c_t, data0, 16); \
data_i_t = src0 + src10 + b0; \
data_f_t = src1 + src11 + b1; \
data_g_t = src2 + src12 + b2; \
data_o_t = src3 + src13 + b3; \
\
data_i_t = act_func(data_i_t); \
data_f_t = act_func(data_f_t + forget_bias); \
data_g_t = tangentH(data_g_t); \
data_i_t = data_i_t * data_g_t; \
data_c_t = data_c_t * data_f_t + data_i_t; \
data_o_t = act_func(data_o_t); \
data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \
data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \
_viv_asm(COPY, vect0, data_c_t, 16); \
VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
data_c_t = tangentH(data_c_t); \
data_o_t = data_o_t * data_c_t * outputScale + outputZP; \
_viv_asm(COPY, vect1, data_o_t, 16); \
VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_WriteImage(h_state_out, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
LSTMUNIT_B_BF16(SIGMOID, sigmoid)
LSTMUNIT_B_BF16(HARD_SIGMOID, hard_sigmoid)

View File

@ -0,0 +1,111 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
_viv_uniform float logE;
_viv_uniform float twoLogE;
_viv_uniform float forget_bias;
float4 sigmoid(float4 x)
{
x *= -logE;
x = 1 + exp2(x);
return 1 / x;
}
float4 hard_sigmoid(float4 x)
{
x = 0.2 * x + 0.5;
x = clamp(x, 0, 1);
return x;
}
float4 tangentH(float4 x)
{
x *= -twoLogE;
x = 1 + exp2(x);
x = 1 / x;
return 2 * x - 1;
}
_viv_uniform float outputScale;
_viv_uniform float outputZP;
_viv_uniform float4 clip_Min_F;
_viv_uniform float4 clip_Max_F;
#define LSTMUNIT_CBP_BF16(act_name, act_func) \
__kernel void lstmunit_activation_CBP_BF16toBF16_BF16_##act_name( \
__read_only image2d_array_t input_f_conv, \
__read_only image2d_array_t input_c_conv, \
__read_only image2d_array_t input_o_conv, \
__read_only image2d_t cell_state_in, \
__read_only image2d_array_t hstate_f_conv, \
__read_only image2d_array_t hstate_c_conv, \
__read_only image2d_array_t hstate_o_conv, \
__read_only image2d_t bias_f, \
__read_only image2d_t bias_c, \
__read_only image2d_t bias_o, \
__write_only image2d_array_t output, \
__write_only image2d_t cell_state_out, \
int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \
) \
{ \
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \
float4 src0, src1, src2, src3, src4; \
vxc_short8 vect10, vect11, vect12, vect13; \
float4 src10, src11, src12, src13; \
float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \
float4 b0, b1, b2, b3; \
VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
b1 = read_imagef(bias_f, coord_in.xw); \
b2 = read_imagef(bias_c, coord_in.xw); \
b3 = read_imagef(bias_o, coord_in.xw); \
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src1, data0, 16); \
VXC_DP2x8(data1, vect11, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src11, data1, 16); \
VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src2, data0, 16); \
VXC_DP2x8(data1, vect12, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src12, data1, 16); \
VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src3, data0, 16); \
VXC_DP2x8(data1, vect13, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src13, data1, 16); \
VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, data_c_t, data0, 16); \
data_f_t = src1 + src11 + b1; \
data_g_t = src2 + src12 + b2; \
data_o_t = src3 + src13 + b3; \
\
data_f_t = act_func(data_f_t + forget_bias); \
data_g_t = tangentH(data_g_t); \
data_i_t = 1.0 - data_f_t; \
data_i_t = data_i_t * data_g_t; \
data_c_t = data_c_t * data_f_t + data_i_t; \
data_o_t = act_func(data_o_t); \
data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \
data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \
_viv_asm(COPY, vect0, data_c_t, 16); \
VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
data_c_t = tangentH(data_c_t); \
data_o_t = data_o_t * data_c_t * outputScale + outputZP; \
_viv_asm(COPY, vect1, data_o_t, 16); \
VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
LSTMUNIT_CBP_BF16(SIGMOID, sigmoid)
LSTMUNIT_CBP_BF16(HARD_SIGMOID, hard_sigmoid)
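Note: the CBP/CB variants are the CIFG (coupled input-forget gate) form: the input gate has no projections of its own and is instead derived as

    i = 1 - f

which is why the input_i/hstate_i/bias_i arguments disappear from the signature; the rest of the cell update is unchanged.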

View File

@ -0,0 +1,113 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
_viv_uniform float logE;
_viv_uniform float twoLogE;
_viv_uniform float forget_bias;
float4 sigmoid(float4 x)
{
x *= -logE;
x = 1 + exp2(x);
return 1 / x;
}
float4 hard_sigmoid(float4 x)
{
x = 0.2 * x + 0.5;
x = clamp(x, 0, 1);
return x;
}
float4 tangentH(float4 x)
{
x *= -twoLogE;
x = 1 + exp2(x);
x = 1 / x;
return 2 * x - 1;
}
_viv_uniform float outputScale;
_viv_uniform float outputZP;
_viv_uniform float4 clip_Min_F;
_viv_uniform float4 clip_Max_F;
#define LSTMUNIT_CB_BF16(act_name, act_func) \
__kernel void lstmunit_activation_CB_BF16toBF16_BF16_##act_name( \
__read_only image2d_array_t input_f_conv, \
__read_only image2d_array_t input_c_conv, \
__read_only image2d_array_t input_o_conv, \
__read_only image2d_t cell_state_in, \
__read_only image2d_array_t hstate_f_conv, \
__read_only image2d_array_t hstate_c_conv, \
__read_only image2d_array_t hstate_o_conv, \
__read_only image2d_t bias_f, \
__read_only image2d_t bias_c, \
__read_only image2d_t bias_o, \
__write_only image2d_array_t output, \
__write_only image2d_t cell_state_out, \
__write_only image2d_t h_state_out, \
int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \
) \
{ \
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \
float4 src0, src1, src2, src3, src4; \
vxc_short8 vect10, vect11, vect12, vect13; \
float4 src10, src11, src12, src13; \
float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \
float4 b0, b1, b2, b3; \
VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
b1 = read_imagef(bias_f, coord_in.xw); \
b2 = read_imagef(bias_c, coord_in.xw); \
b3 = read_imagef(bias_o, coord_in.xw); \
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src1, data0, 16); \
VXC_DP2x8(data1, vect11, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src11, data1, 16); \
VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src2, data0, 16); \
VXC_DP2x8(data1, vect12, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src12, data1, 16); \
VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src3, data0, 16); \
VXC_DP2x8(data1, vect13, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src13, data1, 16); \
VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, data_c_t, data0, 16); \
data_f_t = src1 + src11 + b1; \
data_g_t = src2 + src12 + b2; \
data_o_t = src3 + src13 + b3; \
\
data_f_t = act_func(data_f_t + forget_bias); \
data_g_t = tangentH(data_g_t); \
data_i_t = 1.0 - data_f_t; \
data_i_t = data_i_t * data_g_t; \
data_c_t = data_c_t * data_f_t + data_i_t; \
data_o_t = act_func(data_o_t); \
data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \
data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \
_viv_asm(COPY, vect0, data_c_t, 16); \
VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
data_c_t = tangentH(data_c_t); \
data_o_t = data_o_t * data_c_t * outputScale + outputZP; \
_viv_asm(COPY, vect1, data_o_t, 16); \
VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_WriteImage(h_state_out, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
LSTMUNIT_CB_BF16(SIGMOID, sigmoid)
LSTMUNIT_CB_BF16(HARD_SIGMOID, hard_sigmoid)

View File

@ -0,0 +1,101 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
_viv_uniform float logE;
_viv_uniform float twoLogE;
_viv_uniform float forget_bias;
float4 sigmoid(float4 x)
{
x *= -logE;
x = 1 + exp2(x);
return 1 / x;
}
float4 hard_sigmoid(float4 x)
{
x = 0.2 * x + 0.5;
x = clamp(x, 0, 1);
return x;
}
float4 tangentH(float4 x)
{
x *= -twoLogE;
x = 1 + exp2(x);
x = 1 / x;
return 2 * x - 1;
}
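/* Note on the helpers above: they are phrased around exp2, which is assumed to
 * map to the faster hardware instruction. With the host binding logE = log2(e)
 * and twoLogE = 2 * log2(e), the identities
 *     sigmoid(x)  = 1 / (1 + 2^(-x * log2(e)))      = 1 / (1 + e^(-x))
 *     tangentH(x) = 2 / (1 + 2^(-2x * log2(e))) - 1 = tanh(x)
 * recover the usual sigmoid and tanh. */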
_viv_uniform float outputScale;
_viv_uniform float outputZP;
_viv_uniform float4 clip_Min_F;
_viv_uniform float4 clip_Max_F;
_viv_uniform VXC_512Bits uniExtractHalf4_4x4;
#define LSTMUNIT_CLP_BF16(act_name, act_func) \
__kernel void lstmunit_activation_CLP_BF16toBF16_BF16_##act_name( \
__read_only image2d_array_t input_f_conv, \
__read_only image2d_array_t input_c_conv, \
__read_only image2d_array_t input_o_conv, \
__read_only image2d_t cell_state_in, \
__read_only image2d_t bias_f, \
__read_only image2d_t bias_c, \
__read_only image2d_t bias_o, \
__read_only image2d_t layer_norm_wf, \
__read_only image2d_t layer_norm_wc, \
__read_only image2d_t layer_norm_wo, \
__write_only image2d_array_t output, \
__write_only image2d_t cell_state_out, \
int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \
) \
{ \
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \
float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \
float4 w0, w1, w2, b0, b1, b2; \
VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect4, cell_state_in, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
w0 = read_imagef(layer_norm_wf, coord_in.xw); \
w1 = read_imagef(layer_norm_wc, coord_in.xw); \
w2 = read_imagef(layer_norm_wo, coord_in.xw); \
b0 = read_imagef(bias_f, coord_in.xw); \
b1 = read_imagef(bias_c, coord_in.xw); \
b2 = read_imagef(bias_o, coord_in.xw); \
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, data_f_t, data0, 16); \
VXC_DP2x8(data1, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, data_g_t, data1, 16); \
VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, data_o_t, data0, 16); \
VXC_DP2x8(data1, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, data_c_t, data1, 16); \
\
data_f_t = data_f_t * w0 + b0; \
data_g_t = data_g_t * w1 + b1; \
data_o_t = data_o_t * w2 + b2; \
\
data_f_t = act_func(data_f_t + forget_bias); \
data_g_t = tangentH(data_g_t); \
data_i_t = 1.0 - data_f_t; \
data_i_t = data_i_t * data_g_t; \
data_c_t = data_c_t * data_f_t + data_i_t; \
data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \
data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \
_viv_asm(COPY, vect0, data_c_t, 16); \
VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
data_o_t = act_func(data_o_t); \
data_c_t = tangentH(data_c_t); \
data_o_t = data_o_t * data_c_t * outputScale + outputZP; \
_viv_asm(COPY, vect1, data_o_t, 16); \
VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
LSTMUNIT_CLP_BF16(SIGMOID, sigmoid)
LSTMUNIT_CLP_BF16(HARD_SIGMOID, hard_sigmoid)

View File

@ -0,0 +1,102 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
_viv_uniform float logE;
_viv_uniform float twoLogE;
_viv_uniform float forget_bias;
float4 sigmoid(float4 x)
{
x *= -logE;
x = 1 + exp2(x);
return 1 / x;
}
float4 hard_sigmoid(float4 x)
{
x = 0.2 * x + 0.5;
x = clamp(x, 0, 1);
return x;
}
float4 tangentH(float4 x)
{
x *= -twoLogE;
x = 1 + exp2(x);
x = 1 / x;
return 2 * x - 1;
}
_viv_uniform float outputScale;
_viv_uniform float outputZP;
_viv_uniform float4 clip_Min_F;
_viv_uniform float4 clip_Max_F;
#define LSTMUNIT_CL_BF16(act_name, act_func) \
__kernel void lstmunit_activation_CL_BF16toBF16_BF16_##act_name( \
__read_only image2d_array_t input_f_conv, \
__read_only image2d_array_t input_c_conv, \
__read_only image2d_array_t input_o_conv, \
__read_only image2d_t cell_state_in, \
__read_only image2d_t bias_f, \
__read_only image2d_t bias_c, \
__read_only image2d_t bias_o, \
__read_only image2d_t layer_norm_wf, \
__read_only image2d_t layer_norm_wc, \
__read_only image2d_t layer_norm_wo, \
__write_only image2d_array_t output, \
__write_only image2d_t cell_state_out, \
__write_only image2d_t h_state_out, \
int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \
) \
{ \
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \
float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \
float4 w0, w1, w2, b0, b1, b2; \
VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect4, cell_state_in, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
w0 = read_imagef(layer_norm_wf, coord_in.xw); \
w1 = read_imagef(layer_norm_wc, coord_in.xw); \
w2 = read_imagef(layer_norm_wo, coord_in.xw); \
b0 = read_imagef(bias_f, coord_in.xw); \
b1 = read_imagef(bias_c, coord_in.xw); \
b2 = read_imagef(bias_o, coord_in.xw); \
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, data_f_t, data0, 16); \
VXC_DP2x8(data1, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, data_g_t, data1, 16); \
VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, data_o_t, data0, 16); \
VXC_DP2x8(data1, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, data_c_t, data1, 16); \
\
data_f_t = data_f_t * w0 + b0; \
data_g_t = data_g_t * w1 + b1; \
data_o_t = data_o_t * w2 + b2; \
\
data_f_t = act_func(data_f_t + forget_bias); \
data_g_t = tangentH(data_g_t); \
data_i_t = 1.0 - data_f_t; \
data_i_t = data_i_t * data_g_t; \
data_c_t = data_c_t * data_f_t + data_i_t; \
data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \
data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \
_viv_asm(COPY, vect0, data_c_t, 16); \
VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
data_o_t = act_func(data_o_t); \
data_c_t = tangentH(data_c_t); \
data_o_t = data_o_t * data_c_t * outputScale + outputZP; \
_viv_asm(COPY, vect1, data_o_t, 16); \
VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_WriteImage(h_state_out, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
LSTMUNIT_CL_BF16(SIGMOID, sigmoid)
LSTMUNIT_CL_BF16(HARD_SIGMOID, hard_sigmoid)

View File

@ -0,0 +1,104 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
_viv_uniform float logE;
_viv_uniform float twoLogE;
_viv_uniform float forget_bias;
float4 sigmoid(float4 x)
{
x *= -logE;
x = 1 + exp2(x);
return 1 / x;
}
float4 hard_sigmoid(float4 x)
{
x = 0.2 * x + 0.5;
x = clamp(x, 0, 1);
return x;
}
float4 tangentH(float4 x)
{
x *= -twoLogE;
x = 1 + exp2(x);
x = 1 / x;
return 2 * x - 1;
}
_viv_uniform float outputScale;
_viv_uniform float outputZP;
_viv_uniform float4 clip_Min_F;
_viv_uniform float4 clip_Max_F;
#define LSTMUNIT_CSP_BF16(act_name, act_func) \
__kernel void lstmunit_activation_CSP_BF16toBF16_BF16_##act_name( \
__read_only image2d_array_t input_f_conv, \
__read_only image2d_array_t input_c_conv, \
__read_only image2d_array_t input_o_conv, \
__read_only image2d_t cell_state_in, \
__read_only image2d_array_t hstate_f_conv, \
__read_only image2d_array_t hstate_c_conv, \
__read_only image2d_array_t hstate_o_conv, \
__write_only image2d_array_t output, \
__write_only image2d_t cell_state_out, \
int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \
) \
{ \
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \
float4 src0, src1, src2, src3, src4; \
vxc_short8 vect10, vect11, vect12, vect13; \
float4 src10, src11, src12, src13; \
float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \
VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src1, data0, 16); \
VXC_DP2x8(data1, vect11, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src11, data1, 16); \
VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src2, data0, 16); \
VXC_DP2x8(data1, vect12, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src12, data1, 16); \
VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src3, data0, 16); \
VXC_DP2x8(data1, vect13, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src13, data1, 16); \
VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, data_c_t, data0, 16); \
data_f_t = src1 + src11; \
data_g_t = src2 + src12; \
data_o_t = src3 + src13; \
\
data_f_t = act_func(data_f_t + forget_bias); \
data_g_t = tangentH(data_g_t); \
data_i_t = 1.0 - data_f_t; \
data_i_t = data_i_t * data_g_t; \
data_c_t = data_c_t * data_f_t + data_i_t; \
data_o_t = act_func(data_o_t); \
data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \
data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \
_viv_asm(COPY, vect0, data_c_t, 16); \
VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
data_c_t = tangentH(data_c_t); \
data_o_t = data_o_t * data_c_t * outputScale + outputZP; \
_viv_asm(COPY, vect1, data_o_t, 16); \
VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
LSTMUNIT_CSP_BF16(SIGMOID, sigmoid)
LSTMUNIT_CSP_BF16(HARD_SIGMOID, hard_sigmoid)

View File

@ -0,0 +1,106 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
_viv_uniform float logE;
_viv_uniform float twoLogE;
_viv_uniform float forget_bias;
float4 sigmoid(float4 x)
{
x *= -logE;
x = 1 + exp2(x);
return 1 / x;
}
float4 hard_sigmoid(float4 x)
{
x = 0.2 * x + 0.5;
x = clamp(x, 0, 1);
return x;
}
float4 tangentH(float4 x)
{
x *= -twoLogE;
x = 1 + exp2(x);
x = 1 / x;
return 2 * x - 1;
}
_viv_uniform float outputScale;
_viv_uniform float outputZP;
_viv_uniform float4 clip_Min_F;
_viv_uniform float4 clip_Max_F;
#define LSTMUNIT_CS_BF16(act_name, act_func) \
__kernel void lstmunit_activation_CS_BF16toBF16_BF16_##act_name( \
__read_only image2d_array_t input_f_conv, \
__read_only image2d_array_t input_c_conv, \
__read_only image2d_array_t input_o_conv, \
__read_only image2d_t cell_state_in, \
__read_only image2d_array_t hstate_f_conv, \
__read_only image2d_array_t hstate_c_conv, \
__read_only image2d_array_t hstate_o_conv, \
__write_only image2d_array_t output, \
__write_only image2d_t cell_state_out, \
__write_only image2d_t h_state_out, \
int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \
) \
{ \
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \
float4 src0, src1, src2, src3, src4; \
vxc_short8 vect10, vect11, vect12, vect13; \
float4 src10, src11, src12, src13; \
float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \
VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src1, data0, 16); \
VXC_DP2x8(data1, vect11, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src11, data1, 16); \
VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src2, data0, 16); \
VXC_DP2x8(data1, vect12, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src12, data1, 16); \
VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src3, data0, 16); \
VXC_DP2x8(data1, vect13, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src13, data1, 16); \
VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, data_c_t, data0, 16); \
data_f_t = src1 + src11; \
data_g_t = src2 + src12; \
data_o_t = src3 + src13; \
\
data_f_t = act_func(data_f_t + forget_bias); \
data_g_t = tangentH(data_g_t); \
data_i_t = 1.0 - data_f_t; \
data_i_t = data_i_t * data_g_t; \
data_c_t = data_c_t * data_f_t + data_i_t; \
data_o_t = act_func(data_o_t); \
data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \
data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \
_viv_asm(COPY, vect0, data_c_t, 16); \
VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
data_c_t = tangentH(data_c_t); \
data_o_t = data_o_t * data_c_t * outputScale + outputZP; \
_viv_asm(COPY, vect1, data_o_t, 16); \
VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_WriteImage(h_state_out, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
LSTMUNIT_CS_BF16(SIGMOID, sigmoid)
LSTMUNIT_CS_BF16(HARD_SIGMOID, hard_sigmoid)

View File

@ -0,0 +1,110 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
_viv_uniform float logE;
_viv_uniform float twoLogE;
_viv_uniform float forget_bias;
float4 sigmoid(float4 x)
{
x *= -logE;
x = 1 + exp2(x);
return 1 / x;
}
float4 hard_sigmoid(float4 x)
{
x = 0.2 * x + 0.5;
x = clamp(x, 0, 1);
return x;
}
float4 tangentH(float4 x)
{
x *= -twoLogE;
x = 1 + exp2(x);
x = 1 / x;
return 2 * x - 1;
}
_viv_uniform float outputScale;
_viv_uniform float outputZP;
_viv_uniform float4 clip_Min_F;
_viv_uniform float4 clip_Max_F;
#define LSTMUNIT_LP_BF16(act_name, act_func) \
__kernel void lstmunit_activation_LP_BF16toBF16_BF16_##act_name( \
__read_only image2d_array_t input_i_conv, \
__read_only image2d_array_t input_f_conv, \
__read_only image2d_array_t input_c_conv, \
__read_only image2d_array_t input_o_conv, \
__read_only image2d_t cell_state_in, \
__read_only image2d_t bias_i, \
__read_only image2d_t bias_f, \
__read_only image2d_t bias_c, \
__read_only image2d_t bias_o, \
__read_only image2d_t layer_norm_wi, \
__read_only image2d_t layer_norm_wf, \
__read_only image2d_t layer_norm_wc, \
__read_only image2d_t layer_norm_wo, \
__write_only image2d_array_t output, \
__write_only image2d_t cell_state_out, \
int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \
) \
{ \
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \
float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \
float4 w0, w1, w2, w3, b0, b1, b2, b3; \
VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect4, cell_state_in, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
w0 = read_imagef(layer_norm_wi, coord_in.xw); \
w1 = read_imagef(layer_norm_wf, coord_in.xw); \
w2 = read_imagef(layer_norm_wc, coord_in.xw); \
w3 = read_imagef(layer_norm_wo, coord_in.xw); \
b0 = read_imagef(bias_i, coord_in.xw); \
b1 = read_imagef(bias_f, coord_in.xw); \
b2 = read_imagef(bias_c, coord_in.xw); \
b3 = read_imagef(bias_o, coord_in.xw); \
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
VXC_DP2x8(data0, vect0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, data_i_t, data0, 16); \
VXC_DP2x8(data1, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, data_f_t, data1, 16); \
VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, data_g_t, data0, 16); \
VXC_DP2x8(data1, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, data_o_t, data1, 16); \
VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, data_c_t, data0, 16); \
\
data_i_t = data_i_t * w0 + b0; \
data_f_t = data_f_t * w1 + b1; \
data_g_t = data_g_t * w2 + b2; \
data_o_t = data_o_t * w3 + b3; \
\
data_i_t = act_func(data_i_t); \
data_f_t = act_func(data_f_t + forget_bias); \
data_g_t = tangentH(data_g_t); \
data_i_t = data_i_t * data_g_t; \
data_c_t = data_c_t * data_f_t + data_i_t; \
data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \
data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \
_viv_asm(COPY, vect0, data_c_t, 16); \
VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
data_o_t = act_func(data_o_t); \
data_c_t = tangentH(data_c_t); \
data_o_t = data_o_t * data_c_t * outputScale + outputZP; \
_viv_asm(COPY, vect1, data_o_t, 16); \
VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
LSTMUNIT_LP_BF16(SIGMOID, sigmoid)
LSTMUNIT_LP_BF16(HARD_SIGMOID, hard_sigmoid)

View File

@ -0,0 +1,112 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
_viv_uniform float logE;
_viv_uniform float twoLogE;
_viv_uniform float forget_bias;
float4 sigmoid(float4 x)
{
x *= -logE;
x = 1 + exp2(x);
return 1 / x;
}
float4 hard_sigmoid(float4 x)
{
x = 0.2 * x + 0.5;
x = clamp(x, 0, 1);
return x;
}
float4 tangentH(float4 x)
{
x *= -twoLogE;
x = 1 + exp2(x);
x = 1 / x;
return 2 * x - 1;
}
_viv_uniform float outputScale;
_viv_uniform float outputZP;
_viv_uniform float4 clip_Min_F;
_viv_uniform float4 clip_Max_F;
#define LSTMUNIT_L_BF16(act_name, act_func) \
__kernel void lstmunit_activation_L_BF16toBF16_BF16_##act_name( \
__read_only image2d_array_t input_i_conv, \
__read_only image2d_array_t input_f_conv, \
__read_only image2d_array_t input_c_conv, \
__read_only image2d_array_t input_o_conv, \
__read_only image2d_t cell_state_in, \
__read_only image2d_t bias_i, \
__read_only image2d_t bias_f, \
__read_only image2d_t bias_c, \
__read_only image2d_t bias_o, \
__read_only image2d_t layer_norm_wi, \
__read_only image2d_t layer_norm_wf, \
__read_only image2d_t layer_norm_wc, \
__read_only image2d_t layer_norm_wo, \
__write_only image2d_array_t output, \
__write_only image2d_t cell_state_out, \
__write_only image2d_t h_state_out, \
int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \
) \
{ \
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \
float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \
float4 w0, w1, w2, w3, b0, b1, b2, b3; \
VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect4, cell_state_in, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
w0 = read_imagef(layer_norm_wi, coord_in.xw); \
w1 = read_imagef(layer_norm_wf, coord_in.xw); \
w2 = read_imagef(layer_norm_wc, coord_in.xw); \
w3 = read_imagef(layer_norm_wo, coord_in.xw); \
b0 = read_imagef(bias_i, coord_in.xw); \
b1 = read_imagef(bias_f, coord_in.xw); \
b2 = read_imagef(bias_c, coord_in.xw); \
b3 = read_imagef(bias_o, coord_in.xw); \
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
VXC_DP2x8(data0, vect0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, data_i_t, data0, 16); \
VXC_DP2x8(data1, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, data_f_t, data1, 16); \
VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, data_g_t, data0, 16); \
VXC_DP2x8(data1, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, data_o_t, data1, 16); \
VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, data_c_t, data0, 16); \
\
data_i_t = data_i_t * w0 + b0; \
data_f_t = data_f_t * w1 + b1; \
data_g_t = data_g_t * w2 + b2; \
data_o_t = data_o_t * w3 + b3; \
\
data_i_t = act_func(data_i_t); \
data_f_t = act_func(data_f_t + forget_bias); \
data_g_t = tangentH(data_g_t); \
data_i_t = data_i_t * data_g_t; \
data_c_t = data_c_t * data_f_t + data_i_t; \
data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \
data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \
_viv_asm(COPY, vect0, data_c_t, 16); \
VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
data_o_t = act_func(data_o_t); \
data_c_t = tangentH(data_c_t); \
data_o_t = data_o_t * data_c_t * outputScale + outputZP; \
_viv_asm(COPY, vect1, data_o_t, 16); \
VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_WriteImage(h_state_out, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
LSTMUNIT_L_BF16(SIGMOID, sigmoid)
LSTMUNIT_L_BF16(HARD_SIGMOID, hard_sigmoid)

View File

@ -0,0 +1,117 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
_viv_uniform float logE;
_viv_uniform float twoLogE;
_viv_uniform float forget_bias;
float4 sigmoid(float4 x)
{
x *= -logE;
x = 1 + exp2(x);
return 1 / x;
}
float4 hard_sigmoid(float4 x)
{
x = 0.2 * x + 0.5;
x = clamp(x, 0, 1);
return x;
}
float4 tangentH(float4 x)
{
x *= -twoLogE;
x = 1 + exp2(x);
x = 1 / x;
return 2 * x - 1;
}
_viv_uniform float outputScale;
_viv_uniform float outputZP;
_viv_uniform float4 clip_Min_F;
_viv_uniform float4 clip_Max_F;
#define LSTMUNIT_SP_BF16(act_name, act_func) \
__kernel void lstmunit_activation_SP_BF16toBF16_BF16_##act_name( \
__read_only image2d_array_t input_i_conv, \
__read_only image2d_array_t input_f_conv, \
__read_only image2d_array_t input_c_conv, \
__read_only image2d_array_t input_o_conv, \
__read_only image2d_t cell_state_in, \
__read_only image2d_array_t hstate_i_conv, \
__read_only image2d_array_t hstate_f_conv, \
__read_only image2d_array_t hstate_c_conv, \
__read_only image2d_array_t hstate_o_conv, \
__write_only image2d_array_t output, \
__write_only image2d_t cell_state_out, \
int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \
) \
{ \
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1, data2, data3; \
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
vxc_float4 src0, src1, src2, src3; \
vxc_short8 vect10, vect11, vect12, vect13; \
vxc_float4 src10, src11, src12, src13; \
float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \
VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_DP2x8(data0, vect0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src0, data0, 16); \
VXC_DP2x8(data1, vect10, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src10, data1, 16); \
VXC_DP2x8(data2, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src1, data2, 16); \
VXC_DP2x8(data3, vect11, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src11, data3, 16); \
VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src2, data0, 16); \
VXC_DP2x8(data1, vect12, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src12, data1, 16); \
VXC_DP2x8(data2, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src3, data2, 16); \
VXC_DP2x8(data3, vect13, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src13, data3, 16); \
VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, data_c_t, data0, 16); \
data_i_t = src0 + src10; \
data_f_t = src1 + src11; \
data_g_t = src2 + src12; \
data_o_t = src3 + src13; \
\
data_i_t = act_func(data_i_t); \
data_f_t = act_func(data_f_t + forget_bias); \
data_g_t = tangentH(data_g_t); \
data_i_t = data_i_t * data_g_t; \
data_c_t = data_c_t * data_f_t + data_i_t; \
data_o_t = act_func(data_o_t); \
data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \
data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \
_viv_asm(COPY, data0, data_c_t, 16); \
VXC_DP2x8(data1, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniExtractOddData_2x8); \
VXC_WriteImage(cell_state_out, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
data_c_t = tangentH(data_c_t); \
data_o_t = data_o_t * data_c_t * outputScale + outputZP; \
_viv_asm(COPY, data0, data_o_t, 16); \
VXC_DP2x8(data1, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniExtractOddData_2x8); \
VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
LSTMUNIT_SP_BF16(SIGMOID, sigmoid)
LSTMUNIT_SP_BF16(HARD_SIGMOID, hard_sigmoid)

View File

@ -0,0 +1,118 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
_viv_uniform float logE;
_viv_uniform float twoLogE;
_viv_uniform float forget_bias;
float4 sigmoid(float4 x)
{
x *= -logE;
x = 1 + exp2(x);
return 1 / x;
}
float4 hard_sigmoid(float4 x)
{
x = 0.2 * x + 0.5;
x = clamp(x, 0, 1);
return x;
}
float4 tangentH(float4 x)
{
x *= -twoLogE;
x = 1 + exp2(x);
x = 1 / x;
return 2 * x - 1;
}
_viv_uniform float outputScale;
_viv_uniform float outputZP;
_viv_uniform float4 clip_Min_F;
_viv_uniform float4 clip_Max_F;
#define LSTMUNIT_S_BF16(act_name, act_func) \
__kernel void lstmunit_activation_S_BF16toBF16_BF16_##act_name( \
__read_only image2d_array_t input_i_conv, \
__read_only image2d_array_t input_f_conv, \
__read_only image2d_array_t input_c_conv, \
__read_only image2d_array_t input_o_conv, \
__read_only image2d_t cell_state_in, \
__read_only image2d_array_t hstate_i_conv, \
__read_only image2d_array_t hstate_f_conv, \
__read_only image2d_array_t hstate_c_conv, \
__read_only image2d_array_t hstate_o_conv, \
__write_only image2d_array_t output, \
__write_only image2d_t cell_state_out, \
__write_only image2d_t h_state_out, \
int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \
) \
{ \
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \
vxc_float4 src0, src1, src2, src3, src4; \
vxc_short8 vect10, vect11, vect12, vect13; \
vxc_float4 src10, src11, src12, src13; \
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \
VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_DP2x8(data0, vect0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src0, data0, 16); \
VXC_DP2x8(data1, vect10, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src10, data1, 16); \
VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src1, data0, 16); \
VXC_DP2x8(data1, vect11, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src11, data1, 16); \
VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src2, data0, 16); \
VXC_DP2x8(data1, vect12, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src12, data1, 16); \
VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src3, data0, 16); \
VXC_DP2x8(data1, vect13, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src13, data1, 16); \
VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, data_c_t, data0, 16); \
\
data_i_t = src0 + src10; \
data_f_t = src1 + src11; \
data_g_t = src2 + src12; \
data_o_t = src3 + src13; \
\
data_i_t = act_func(data_i_t); \
data_f_t = act_func(data_f_t + forget_bias); \
data_g_t = tangentH(data_g_t); \
data_i_t = data_i_t * data_g_t; \
data_c_t = data_c_t * data_f_t + data_i_t; \
data_o_t = act_func(data_o_t); \
data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \
data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \
_viv_asm(COPY, vect0, data_c_t, 16); \
VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
data_c_t = tangentH(data_c_t); \
data_o_t = data_o_t * data_c_t * outputScale + outputZP; \
_viv_asm(COPY, vect1, data_o_t, 16); \
VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_WriteImage(h_state_out, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
LSTMUNIT_S_BF16(SIGMOID, sigmoid)
LSTMUNIT_S_BF16(HARD_SIGMOID, hard_sigmoid)

View File

@ -0,0 +1,283 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
_viv_uniform VXC_512Bits uniConvF16toFp32_4x4;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
_viv_uniform float inout_scale;
_viv_uniform float inout_tail;
_viv_uniform int width;
_viv_uniform int height;
#define MAXPOOL_QINT(in_name, out_name, src_type, dst_type, max_val) \
__kernel void maxpool_##in_name##to##out_name( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int stride_x, int stride_y, int pad_x, int pad_y, \
int kernel_dia_x, int kernel_dia_y, int dilation_x, int dilation_y) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
int gidz = get_global_id(2); \
int4 coord_out = (int4)(gidx, gidy, gidz, 0); \
int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y); \
int4 coord_in = coord_out; \
int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y); \
for(; pos_start.x < 0;) \
{ \
pos_start.x += dilation_x; \
} \
for(; pos_start.y < 0;) \
{ \
pos_start.y += dilation_y; \
} \
pos_end = min(pos_end, (int2)(width, height)); \
\
src_type src0; \
dst_type maxVal; \
maxVal.x = max_val; \
\
int8 input_desc; \
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr_a); \
\
for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y) \
{ \
for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;) \
{ \
VXC_OP4(img_load_3d, src0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
coord_in.x += dilation_x; \
VXC_VertMax3_Integer(maxVal, src0, src0, maxVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
} \
} \
\
float4 fValTmp; \
fValTmp.x = maxVal.x * inout_scale + inout_tail; \
int4 i4Val = convert_int4_rte(fValTmp); \
VXC_DP2x8(maxVal, i4Val, i4Val, VXC_MODIFIER(0, 0, 0, VXC_RM_ToNearestEven, 1), \
uniConvertInt32toUint8_2x8); \
VXC_WriteImage2DArray(output, coord_out, maxVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
}
MAXPOOL_QINT(U8, U8, vxc_uchar8, vxc_uchar8, 0)
MAXPOOL_QINT(I8, I8, vxc_char8, vxc_char8, -128)
MAXPOOL_QINT(I16, I16, vxc_short8, vxc_short8, -32768)
__kernel void maxpool_F16toF16(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int stride_x, int stride_y, int pad_x, int pad_y,
int kernel_dia_x, int kernel_dia_y, int dilation_x, int dilation_y)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int gidz = get_global_id(2);
int4 coord_out = (int4)(gidx, gidy, gidz, 0);
int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y);
int4 coord_in = coord_out;
int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y);
for(; pos_start.x < 0;)
{
pos_start.x += dilation_x;
}
for(; pos_start.y < 0;)
{
pos_start.y += dilation_y;
}
pos_end = min(pos_end, (int2)(width, height));
vxc_short8 data0;
vxc_half8 maxVal, src0;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr_a);
coord_in.xy = pos_start;
VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, maxVal, data0, 16);
for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y)
{
for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;)
{
VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x += dilation_x;
_viv_asm(COPY, src0, data0, 16);
VXC_VertMax3_Half(maxVal, src0, src0, maxVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
}
_viv_asm(COPY, data0, maxVal, 16);
VXC_WriteImage2DArray(output, coord_out, data0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
#define MAXPOOL_F16_TO_QINT(out_name, dst_type) \
__kernel void maxpool_F16to##out_name( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int stride_x, int stride_y, int pad_x, int pad_y, \
int kernel_dia_x, int kernel_dia_y, int dilation_x, int dilation_y) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
int gidz = get_global_id(2); \
int4 coord_out = (int4)(gidx, gidy, gidz, 0); \
int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y); \
int4 coord_in = coord_out; \
int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y); \
for(; pos_start.x < 0;) \
{ \
pos_start.x += dilation_x; \
} \
for(; pos_start.y < 0;) \
{ \
pos_start.y += dilation_y; \
} \
pos_end = min(pos_end, (int2)(width, height)); \
\
vxc_short8 data0; \
vxc_half8 maxVal, src0; \
\
int8 input_desc; \
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr_a); \
coord_in.xy = pos_start; \
\
VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, maxVal, data0, 16); \
\
for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y) \
{ \
for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;) \
{ \
VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
coord_in.x += dilation_x; \
_viv_asm(COPY, src0, data0, 16); \
VXC_VertMax3_Half(maxVal, src0, src0, maxVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
} \
} \
float4 fValTmp; \
VXC_DP4x4(fValTmp, maxVal, maxVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniConvF16toFp32_4x4); \
fValTmp.x = fValTmp.x * inout_scale + inout_tail; \
int4 i4Val = convert_int4_rte(fValTmp); \
dst_type dst; \
VXC_DP2x8(dst, i4Val, i4Val, VXC_MODIFIER(0, 0, 0, VXC_RM_ToNearestEven, 1), \
uniConvertInt32toUint8_2x8); \
VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
}
MAXPOOL_F16_TO_QINT(U8, vxc_uchar8)
MAXPOOL_F16_TO_QINT(I8, vxc_char8)
MAXPOOL_F16_TO_QINT(I16, vxc_short8)
#define MAXPOOL_QINT_TO_F16(in_name, src_type, max_val) \
__kernel void maxpool_##in_name##toF16( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int stride_x, int stride_y, int pad_x, int pad_y, \
int kernel_dia_x, int kernel_dia_y, int dilation_x, int dilation_y) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
int gidz = get_global_id(2); \
int4 coord_out = (int4)(gidx, gidy, gidz, 0); \
int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y); \
int4 coord_in = coord_out; \
int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y); \
for(; pos_start.x < 0;) \
{ \
pos_start.x += dilation_x; \
} \
for(; pos_start.y < 0;) \
{ \
pos_start.y += dilation_y; \
} \
pos_end = min(pos_end, (int2)(width, height)); \
\
src_type src0, maxVal; \
maxVal.x = max_val; \
\
int8 input_desc; \
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr_a); \
\
for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y) \
{ \
for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;) \
{ \
VXC_OP4(img_load_3d, src0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
coord_in.x += dilation_x; \
VXC_VertMax3_Integer(maxVal, src0, src0, maxVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
} \
} \
\
float4 fValTmp; \
fValTmp.x = maxVal.x * inout_scale + inout_tail; \
half4 h4Val; \
_viv_asm(CONV, h4Val, fValTmp); \
vxc_short8 dst; \
_viv_asm(COPY, dst, h4Val, 4); \
VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
}
MAXPOOL_QINT_TO_F16(U8, vxc_uchar8, 0)
MAXPOOL_QINT_TO_F16(I8, vxc_char8, -128)
MAXPOOL_QINT_TO_F16(I16, vxc_short8, -32768)
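/* Requantization note (an assumption; the host-side setup is not shown here):
 * inout_scale and inout_tail presumably fold the input and output quantization
 * parameters, e.g. inout_scale = s_in / s_out and
 * inout_tail = zp_out - zp_in * s_in / s_out, so that
 * maxVal.x * inout_scale + inout_tail lands directly in the output quant space. */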
__kernel void maxpool_BF16toBF16(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int stride_x, int stride_y, int pad_x, int pad_y,
int kernel_dia_x, int kernel_dia_y, int dilation_x, int dilation_y)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int gidz = get_global_id(2);
int4 coord_out = (int4)(gidx, gidy, gidz, 0);
int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y);
int4 coord_in = coord_out;
int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y);
for(; pos_start.x < 0;)
{
pos_start.x += dilation_x;
}
for(; pos_start.y < 0;)
{
pos_start.y += dilation_y;
}
pos_end = min(pos_end, (int2)(width, height));
vxc_short8 data0, val0;
float4 maxVal, src0;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr_a);
coord_in.xy = pos_start;
VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
VXC_DP2x8(val0, data0, zero, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, maxVal, val0, 4);
for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y)
{
for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;)
{
VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x += dilation_x;
VXC_DP2x8(val0, data0, zero, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, src0, val0, 4);
maxVal = max(src0, maxVal);
}
}
_viv_asm(COPY, data0, maxVal, 16);
VXC_DP2x8(val0, data0, data0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_WriteImage2DArray(output, coord_out, val0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
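Every variant above shares the same dilated-window walk: pos_start and pos_end bound the dilated kernel extent, negative start coordinates are stepped forward by the dilation so taps in the padding are skipped, and the inner loops stride by dilation_x and dilation_y. A minimal scalar sketch of that index math in plain OpenCL C (a hypothetical helper; kernel_dia_* is assumed to be the dilated extent (k - 1) * dilation + 1):

/* Scalar reference for the dilated max-pool window walk used by the kernels above. */
float maxpool_window_ref(const float *in, int width, int height,
                         int out_x, int out_y,
                         int stride_x, int stride_y, int pad_x, int pad_y,
                         int kernel_dia_x, int kernel_dia_y,
                         int dilation_x, int dilation_y)
{
    int sx = out_x * stride_x - pad_x;
    int sy = out_y * stride_y - pad_y;
    int ex = min(sx + kernel_dia_x, width);  /* clip right/bottom to the image */
    int ey = min(sy + kernel_dia_y, height);
    while (sx < 0) sx += dilation_x;         /* skip taps that fall in the pad */
    while (sy < 0) sy += dilation_y;
    float m = -FLT_MAX;
    for (int y = sy; y < ey; y += dilation_y)
        for (int x = sx; x < ex; x += dilation_x)
            m = fmax(m, in[y * width + x]);
    return m;
}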

View File

@ -17,6 +17,8 @@ _viv_uniform VXC_512Bits uniConvertNV12toR_4x4;
 _viv_uniform VXC_512Bits uniExtract8Data_2x8;
 _viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8;
+_viv_uniform VXC_512Bits uniExtractYtoShortSub16_2x8;
+_viv_uniform VXC_512Bits uniConvertUchartoFp32_4x4;
 #define NV12_COPY_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \
 __kernel void pre_process_nv12_copy_##name \
@ -57,14 +59,24 @@ __kernel void pre_process_nv12_copy_##name \
     UV.s0123 = UV.s1032; \
 } \
 \
+vxc_short8 tmpY; \
 vxc_char16 tmpUV; \
-short tmpVal = 128; \
+short tmpVal = 16; \
+VXC_DP2x8(tmpY, Y, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractYtoShortSub16_2x8); \
+tmpVal = 128; \
 VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8); \
 \
 float4 tmpDstB, tmpDstG, tmpDstR; \
-VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); \
-VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); \
-VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); \
+vxc_uchar4 DstB_uchar, DstG_uchar, DstR_uchar; \
+VXC_DP4x4(DstB_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toB_4x4); \
+VXC_DP4x4(DstG_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toG_4x4); \
+VXC_DP4x4(DstR_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toR_4x4); \
+VXC_DP4x4(tmpDstB, DstB_uchar, DstB_uchar, \
+    VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \
+VXC_DP4x4(tmpDstG, DstG_uchar, DstG_uchar, \
+    VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \
+VXC_DP4x4(tmpDstR, DstR_uchar, DstR_uchar, \
+    VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \
 \
 conv_type result; \
 dst_type dst0; \
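The reworked path subtracts the video-range offsets (16 from Y, 128 from U/V) up front, rounds the dot-product result to uchar, and only then widens to float. For orientation, this is the scalar shape of a video-range BT.601 conversion that dot products like uniConvertNV12to* typically encode (an assumption for illustration; the actual coefficients are bound by the host):

/* Hypothetical scalar BT.601 (video range) conversion; y, u, v already offset. */
float3 nv12_to_rgb_ref(float y, float u, float v)
{
    float r = 1.164f * y + 1.596f * v;
    float g = 1.164f * y - 0.813f * v - 0.391f * u;
    float b = 1.164f * y + 2.018f * u;
    return (float3)(r, g, b);
}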

View File

@ -22,9 +22,11 @@ _viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;
 _viv_uniform VXC_512Bits uniExtract8Data_2x8;
 _viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8;
+_viv_uniform VXC_512Bits uniConvertYtoShortSub16_2x8;
 _viv_uniform VXC_512Bits uniCalculateYShift_2x8;
 _viv_uniform VXC_512Bits uniCalculateUVShift_2x8;
+_viv_uniform VXC_512Bits uniConvertUchartoFp32_4x4;
 #define NV12_OPT_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \
 __kernel void pre_process_nv12_scale_##name##_gq \
@ -85,14 +87,24 @@ __kernel void pre_process_nv12_scale_##name##_gq \
 VXC_BitExtract(Y, Y, Y, maskShift, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
 VXC_BitExtract(UV, UV, UV, maskShiftUv, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
 \
+vxc_short8 tmpY; \
 vxc_char16 tmpUV; \
-short tmpVal = 128; \
+short tmpVal = 16; \
+VXC_DP2x8(tmpY, Y, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertYtoShortSub16_2x8); \
+tmpVal = 128; \
 VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); \
 \
 float4 tmpDstB, tmpDstG, tmpDstR; \
-VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); \
-VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); \
-VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); \
+vxc_uchar4 DstB_uchar, DstG_uchar, DstR_uchar; \
+VXC_DP4x4(DstB_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toB_4x4); \
+VXC_DP4x4(DstG_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toG_4x4); \
+VXC_DP4x4(DstR_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toR_4x4); \
+VXC_DP4x4(tmpDstB, DstB_uchar, DstB_uchar, \
+    VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \
+VXC_DP4x4(tmpDstG, DstG_uchar, DstG_uchar, \
+    VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \
+VXC_DP4x4(tmpDstR, DstR_uchar, DstR_uchar, \
+    VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \
 \
 conv_type result; \
 dst_type dst0; \
@ -181,14 +193,24 @@ __kernel void pre_process_nv12_scale_##name \
 UV.s01234567 = UV.s10325476; \
 } \
 \
+vxc_short8 tmpY; \
 vxc_char16 tmpUV; \
-short tmpVal = 128; \
+short tmpVal = 16; \
+VXC_DP2x8(tmpY, Y, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertYtoShortSub16_2x8); \
+tmpVal = 128; \
 VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); \
 \
 float4 tmpDstB, tmpDstG, tmpDstR; \
-VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); \
-VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); \
-VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); \
+vxc_uchar4 DstB_uchar, DstG_uchar, DstR_uchar; \
+VXC_DP4x4(DstB_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toB_4x4); \
+VXC_DP4x4(DstG_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toG_4x4); \
+VXC_DP4x4(DstR_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toR_4x4); \
+VXC_DP4x4(tmpDstB, DstB_uchar, DstB_uchar, \
+    VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \
+VXC_DP4x4(tmpDstG, DstG_uchar, DstG_uchar, \
+    VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \
+VXC_DP4x4(tmpDstR, DstR_uchar, DstR_uchar, \
+    VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \
 \
 conv_type result; \
 dst_type dst0; \

View File

@ -118,7 +118,7 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \
 VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
 VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
 \
-int4 coord_out = coord; \
+int4 coord_out = coord.wwzw; \
 coord_out.xyw += rgb_order.xyz; \
 float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp, \
 rMean * r_scale * output_scale - output_zp, \

View File

@ -16,6 +16,7 @@ _viv_uniform VXC_512Bits uniConvertYUV422toR_4x4;
 _viv_uniform VXC_512Bits uniExtract8Data_2x8;
 _viv_uniform VXC_512Bits uniExtractYUVtoShortSub_2x8;
+_viv_uniform VXC_512Bits uniConvertUchartoFp32_4x4;
 #define YUV422_COPY_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \
 __kernel void pre_process_yuv422_copy_##name \
@ -54,11 +55,21 @@ __kernel void pre_process_yuv422_copy_##name \
 } \
 \
 float4 tmpDstB, tmpDstG, tmpDstR; \
+vxc_uchar4 DstB_uchar, DstG_uchar, DstR_uchar; \
 vxc_short2 value = (vxc_short2)(128,16); \
 VXC_DP2x8(tmpYUV, YUV, value, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractYUVtoShortSub_2x8); \
-VXC_DP4x4(tmpDstB, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toB_4x4); \
-VXC_DP4x4(tmpDstG, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toG_4x4); \
-VXC_DP4x4(tmpDstR, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toR_4x4); \
+VXC_DP4x4(DstB_uchar, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\
+    uniConvertYUV422toB_4x4); \
+VXC_DP4x4(DstG_uchar, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\
+    uniConvertYUV422toG_4x4); \
+VXC_DP4x4(DstR_uchar, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\
+    uniConvertYUV422toR_4x4); \
+VXC_DP4x4(tmpDstB, DstB_uchar, DstB_uchar, \
+    VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \
+VXC_DP4x4(tmpDstG, DstG_uchar, DstG_uchar, \
+    VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \
+VXC_DP4x4(tmpDstR, DstR_uchar, DstR_uchar, \
+    VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \
 \
 conv_type result; \
 dst_type dst0; \

View File

@@ -21,6 +21,7 @@ _viv_uniform VXC_512Bits uniConvertYUV422toR_4x4;
 _viv_uniform VXC_512Bits uniExtract8Data_2x8;
 _viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8;
 _viv_uniform VXC_512Bits uniExtractYtoShortSub16_4x4;
+_viv_uniform VXC_512Bits uniConvertUchartoFp32_4x4;
 #define uyvy422 1
@@ -70,8 +71,8 @@ __kernel void pre_process_yuv422_scale_##name \
     } \
 \
     int4 coord_Y = (int4)(sx.x + y_offset, sy, 0, 0); \
-    int4 coord_U = (int4)((sx.x >> 1) * 2 + u_offset, sy, 0, 0); \
-    int4 coord_V = (int4)((sx.x >> 1) * 2 + v_offset, sy, 0, 0); \
+    int4 coord_U = (int4)((sx.x >> 2) * 4 + u_offset, sy, 0, 0); \
+    int4 coord_V = (int4)((sx.x >> 2) * 4 + v_offset, sy, 0, 0); \
 \
     VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
     coord_Y.x = sx.y + y_offset; \
@@ -81,7 +82,7 @@ __kernel void pre_process_yuv422_scale_##name \
     coord_Y.x = sx.w + y_offset; \
     VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \
 \
-    sx = (sx >> 1) * 2 + u_offset; \
+    sx = (sx >> 2) * 4 + u_offset; \
     VXC_ReadImage2DArray(UV, input, coord_U, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
     coord_U.x = sx.y; \
     VXC_ReadImage2DArray(UV, input, coord_U, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \
@@ -105,9 +106,19 @@ __kernel void pre_process_yuv422_scale_##name \
     VXC_DP2x8(dst_test, dx, dx, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \
 \
     float4 tmpDstB, tmpDstG, tmpDstR; \
-    VXC_DP4x4(tmpDstB, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toB_4x4); \
-    VXC_DP4x4(tmpDstG, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toG_4x4); \
-    VXC_DP4x4(tmpDstR, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toR_4x4); \
+    vxc_uchar4 DstB_uchar, DstG_uchar, DstR_uchar; \
+    VXC_DP4x4(DstB_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\
+        uniConvertYUV422toB_4x4); \
+    VXC_DP4x4(DstG_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\
+        uniConvertYUV422toG_4x4); \
+    VXC_DP4x4(DstR_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\
+        uniConvertYUV422toR_4x4); \
+    VXC_DP4x4(tmpDstB, DstB_uchar, DstB_uchar, \
+        VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \
+    VXC_DP4x4(tmpDstG, DstG_uchar, DstG_uchar, \
+        VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \
+    VXC_DP4x4(tmpDstR, DstR_uchar, DstR_uchar, \
+        VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \
 \
     conv_type result; \
     dst_type dst0; \
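Note: the coordinate fix in this kernel is about packed-4:2:2 addressing. One macropixel is 4 bytes covering two pixels (Y0 U Y1 V for YUYV, U Y0 V Y1 for UYVY), so chroma offsets must snap to a 4-byte boundary; the old `(sx >> 1) * 2` only snapped to the 2-byte luma pair and could land on a Y byte. A small C sketch, assuming sx is the byte offset of the pixel's Y sample (2 bytes per pixel) and chroma_offset selects the U or V byte inside a macropixel:

    /* Packed-4:2:2 chroma addressing: snap to the 4-byte macropixel start. */
    static int chroma_byte_offset(int sx, int chroma_offset)
    {
        return (sx >> 2) * 4 + chroma_offset;
    }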
@@ -21,8 +21,8 @@ __kernel void func_name##_##src0_type_name##src1_type_name##toBOOL8_2D( \
     int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
     src0_type src0; \
     src0_copy_type srcA; \
-    src0_type src1; \
-    src0_copy_type srcB; \
+    src1_type src1; \
+    src1_copy_type srcB; \
     VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
     _viv_asm(COPY, srcA, src0, 16); \
     VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
@@ -21,8 +21,8 @@ __kernel void func_name##_##src0_type_name##src1_type_name##toBOOL8( \
     int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
     src0_type src0; \
     src0_copy_type srcA; \
-    src0_type src1; \
-    src0_copy_type srcB; \
+    src1_type src1; \
+    src1_copy_type srcB; \
     VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
     _viv_asm(COPY, srcA, src0, 16); \
     VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
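Note: both the 2D and 3D comparison variants fix the same declaration bug: the second operand reused the first operand's vector types, so mixed-dtype comparisons reinterpreted input1 with the wrong element type in the COPY. Roughly, for a hypothetical I16-vs-F16 instantiation (type arguments here are illustrative, not an actual instantiation from the file), the macro now expands to:

    vxc_short8 src0;  /* src0_type                                   */
    vxc_short8 srcA;  /* src0_copy_type                              */
    vxc_short8 src1;  /* src1_type: raw int16 payload of input1      */
    vxc_half8  srcB;  /* src1_copy_type: was src0_copy_type (int16)
                       * before the fix, misreading the fp16 data    */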
@@ -28,17 +28,19 @@ __kernel void scatter_nd_update_reset_##name0##to##name1( \
     Image img1 = create_image_from_image2d(input_ref, size0); \
     Image img2 = create_image_from_image2d(temp_ref, size1); \
     Image img3 = create_image_from_image2d(temp_buf_int, 4); \
+    __global int* tmp_update_ptr = (__global int*)img3.ptr; \
+    type0 src; \
+    type1 tmpDst; \
+    vxc_ushort8 ms0; \
+    _viv_asm(COPY, ms0, multAndoutZP0, 16); \
+    if(length > 0) \
+    { \
     __global ptr0* input_ptr = (__global ptr0*)img1.ptr; \
     __global ptr1* output_ptr = (__global ptr1*)img2.ptr; \
-    __global int* tmp_update_ptr = (__global int*)img3.ptr; \
     ptr0 tmpData = input_ptr[gidx]; \
     int4 zeros = (int4)(0); \
     int loc2 = gidx * 8; \
-    type0 src; \
-    type1 tmpDst; \
     ptr1 dst; \
-    vxc_ushort8 ms0; \
-    _viv_asm(COPY, ms0, multAndoutZP0, 16); \
     _viv_asm(COPY, src, tmpData, len0); \
     VXC_DP2x8(tmpDst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
         uniU8MulAndPostShift_0_Lo_2x8); \
@@ -46,19 +48,20 @@ __kernel void scatter_nd_update_reset_##name0##to##name1( \
     output_ptr[gidx] = dst; \
     vstore4(zeros, 0, tmp_update_ptr + loc2); \
     vstore4(zeros, 1, tmp_update_ptr + loc2); \
-    if(gidx < res) \
-    { \
+    } \
     __global ptr2* input_ptr1 = (__global ptr2*)img1.ptr; \
     __global ptr3* output_ptr1 = (__global ptr3*)img2.ptr; \
-    ptr2 tmpData1 = input_ptr1[length + gidx]; \
+    for(int i = gidx; i < res; i += get_global_size(0)) \
+    { \
+    ptr2 tmpData1 = input_ptr1[length + i]; \
     ptr3 dst1; \
     dst1 ^= dst1; \
-    tmp_update_ptr[length + gidx] = 0; \
+    tmp_update_ptr[length + i] = 0; \
     _viv_asm(COPY, src, tmpData1, 4); \
     VXC_DP2x8(tmpDst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
         uniU8MulAndPostShift_0_Lo_2x8); \
     _viv_asm(COPY, dst1, tmpDst, len3); \
-    output_ptr1[length + gidx] = dst1; \
+    output_ptr1[length + i] = dst1; \
     } \
 }
 SCATTER_RESET(U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, 8, 8, 1, 1, uchar, uchar, 1)
@@ -246,14 +249,17 @@ __kernel void scatter_nd_update_copy_##src0_type( \
     int gidx = get_global_id(0); \
     Image img1 = create_image_from_image2d(temp_ref, element_size); \
     Image img2 = create_image_from_image2d(output, element_size); \
+    if(length > 0) \
+    { \
     __global ptr_type* input_ptr = (__global ptr_type*)img1.ptr; \
     __global ptr_type* output_ptr = (__global ptr_type*)img2.ptr; \
     output_ptr[gidx] = input_ptr[gidx]; \
-    if(gidx < res) \
-    { \
+    } \
     __global ptr_type1* input_ptr1 = (__global ptr_type1*)img1.ptr; \
     __global ptr_type1* output_ptr1 = (__global ptr_type1*)img2.ptr; \
-    output_ptr1[length + gidx] = input_ptr1[length + gidx]; \
+    for(int i = gidx; i < res; i += get_global_size(0)) \
+    { \
+    output_ptr1[length + i] = input_ptr1[length + i]; \
     } \
 }
 SCATTER_ND_UPDATE_COPY(U8, vxc_uchar8, 1, uchar)
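Note: both scatter kernels replace the one-shot `if (gidx < res)` tail with a grid-stride loop, so all `res` residual elements are processed even when fewer than `res` work-items are launched; the new `if (length > 0)` guard likewise keeps the bulk path from touching memory when there is no bulk work. The pattern in isolation, as a minimal OpenCL C sketch:

    /* Grid-stride tail loop: coverage is independent of the launch size,
     * because each work-item strides by the total number of work-items. */
    __kernel void tail_copy(__global const int* src, __global int* dst, int res)
    {
        for (int i = get_global_id(0); i < res; i += get_global_size(0))
        {
            dst[i] = src[i];
        }
    }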
File diff suppressed because it is too large
@@ -1,261 +0,0 @@
# to make ovxlib can compile both IDE and SKD
# if you want to use IDE to compile : export USE_IDE_LIB=1
# and VIVANTE_SDK_DIR=..../VeriSilicon/VivanteIDE5.4.0/cmdtools/vsimulator
###################################################################################
#common parts
# OBJECTS.
OBJECTS = $(OBJ_DIR)/vsi_nn_context.o \
$(OBJ_DIR)/vsi_nn_client_op.o \
$(OBJ_DIR)/vsi_nn_graph.o \
$(OBJ_DIR)/vsi_nn_node_attr_template.o \
$(OBJ_DIR)/vsi_nn_node.o \
$(OBJ_DIR)/vsi_nn_ops.o \
$(OBJ_DIR)/vsi_nn_daemon.o \
$(OBJ_DIR)/vsi_nn_tensor.o \
$(OBJ_DIR)/vsi_nn_version.o \
$(OBJ_DIR)/vsi_nn_rnn.o \
$(OBJ_DIR)/vsi_nn_rnn_helper.o \
$(OBJ_DIR)/vsi_nn_internal_node.o \
$(OBJ_DIR)/vsi_nn_log.o \
$(OBJ_DIR)/vsi_nn_graph_optimization.o \
$(OBJ_DIR)/vsi_nn_pre_post_process.o
vpath %.c utils
OBJECTS += $(OBJ_DIR)/vsi_nn_code_generator.o \
$(OBJ_DIR)/vsi_nn_binary_tree.o \
$(OBJ_DIR)/vsi_nn_map.o \
$(OBJ_DIR)/vsi_nn_link_list.o \
$(OBJ_DIR)/vsi_nn_math.o \
$(OBJ_DIR)/vsi_nn_dtype_util.o \
$(OBJ_DIR)/vsi_nn_shape_util.o \
$(OBJ_DIR)/vsi_nn_dtype.o \
$(OBJ_DIR)/vsi_nn_limits.o \
$(OBJ_DIR)/vsi_nn_util.o \
$(OBJ_DIR)/vsi_nn_dlfcn.o \
$(OBJ_DIR)/vsi_nn_constraint_check.o \
$(OBJ_DIR)/vsi_nn_hashmap.o \
$(OBJ_DIR)/vsi_nn_tensor_op.o
vpath %.c quantization
OBJECTS += $(OBJ_DIR)/vsi_nn_dynamic_fixed_point.o \
$(OBJ_DIR)/vsi_nn_asymmetric_affine.o \
$(OBJ_DIR)/vsi_nn_perchannel_symmetric_affine.o
vpath %.c post
OBJECTS += $(OBJ_DIR)/vsi_nn_post_fasterrcnn.o \
$(OBJ_DIR)/vsi_nn_post_cmupose.o
vpath %.c libnnext
OBJECTS += $(OBJ_DIR)/vsi_nn_libnnext_resource.o \
$(OBJ_DIR)/vsi_nn_vxkernel.o
vpath %.c cpu_backend
SRCS += ${notdir ${wildcard cpu_backend/*.c}}
vpath %.c libnnext/ops/kernel
SRCS += ${notdir ${wildcard libnnext/ops/kernel/*.c}}
vpath %.c ops
SRCS += ${notdir ${wildcard ops/*.c}}
vpath %.c kernel
SRCS += ${notdir ${wildcard kernel/*.c}}
vpath %.c kernel/cl
SRCS += ${notdir ${wildcard kernel/cl/*.c}}
vpath %.c kernel/cpu
SRCS += ${notdir ${wildcard kernel/cpu/*.c}}
vpath %.c kernel/evis
SRCS += ${notdir ${wildcard kernel/evis/*.c}}
vpath %.c kernel/vx
SRCS += ${notdir ${wildcard kernel/vx/*.c}}
vpath %.c kernel/sp
SRCS += ${notdir ${wildcard kernel/sp/*.c}}
vpath %.c custom/ops
SRCS += ${notdir ${wildcard custom/ops/*.c}}
vpath %.c custom/ops/kernel/evis
SRCS += ${notdir ${wildcard custom/ops/kernel/evis/*.c}}
vpath %.c custom/ops/kernel/cl
SRCS += ${notdir ${wildcard custom/ops/kernel/cl/*.c}}
vpath %.c custom/ops/kernel/cpu
SRCS += ${notdir ${wildcard custom/ops/kernel/cpu/*.c}}
vpath %.c custom/ops/kernel/sp
SRCS += ${notdir ${wildcard custom/ops/kernel/sp/*.c}}
OBJECTS += ${patsubst %.c, $(OBJ_DIR)/%.o, $(SRCS)}
ifeq ($(USE_VIP_DEVICE),1)
vpath %.cpp vip
OBJECTS += $(OBJ_DIR)/virtual_device.o
endif
################################################################################
ifeq ($(USE_IDE_LIB),1)
# IDE.
CC=$(CROSS_COMPILE)gcc
INCLUDES=-I. -I$(VIVANTE_SDK_DIR)/include/ \
-I$(VIVANTE_SDK_DIR)/include/CL \
-I$(VIVANTE_SDK_DIR)/include/VX \
-I../include/ops -I../include/utils -I../include/inference \
-I../include/client -I../include -I../include/libnnext \
-I../include/cpu_backend \
-I../src
ifeq (1,$(DEBUG))
CFLAGS+=-g
LFLAGS+=-g
else
CFLAGS+=-O3
LFLAGS+=-O3
endif
CFLAGS += $(INCLUDES)
CFLAGS += -Wall -Wextra -Wno-unused-parameter -Wno-sign-compare -Werror -Wno-strict-aliasing -Wno-maybe-uninitialized
CFLAGS += -fvisibility=hidden -D'OVXLIB_API=__attribute__((visibility("default")))'
LIBS+= -L$(VIVANTE_SDK_DIR)/lib \
-lOpenVX -lOpenVXU -lCLC -lVSC -lGAL -lEmulator -lvdtproxy -lArchModelSw -lNNArchPerf
LIBS+= -L$(VIVANTE_SDK_DIR)/lib/vsim \
-lOpenVX -lOpenVXU -lCLC -lVSC -lGAL -lEmulator -lvdtproxy
LIBS+= -L$(VIVANTE_SDK_DIR)/lib/x64_linux \
-lOpenVX -lOpenVXU -lCLC -lVSC -lGAL -lEmulator -lvdtproxy
LIBS+= -L$(VIVANTE_SDK_DIR)/lib/x64_linux/vsim \
-lOpenVX -lOpenVXU -lCLC -lVSC -lGAL -lEmulator -lvdtproxy
LIBS+= -L$(VIVANTE_SDK_DIR)/lib/x64_linux/vsim \
-lOpenVX -lOpenVXU -lCLC -lVSC -lGAL -lEmulator -lvdtproxy
LIBS+= -L$(VIVANTE_SDK_DIR)/../common/lib/ \
-lvdtproxy
LIBS += -lm -ldl
File = $(VIVANTE_SDK_DIR)/lib/libjpeg.a
File2 = $(VIVANTE_SDK_DIR)/lib/x64_linux/libjpeg.a
File3 = $(VIVANTE_SDK_DIR)/../common/lib/libjpeg.a
ifeq ($(File),$(wildcard $(File)))
LIBS+= $(File)
else ifeq ($(File2),$(wildcard $(File2)))
LIBS+= $(File2)
else
LIBS+= $(File3)
endif
###################################################################################
# Macros.
CFLAGS += -fPIC
DYNAMIC := 1
TARGET_NAME = libovxlib.so
OBJ_DIR=bin_r
TARGET_OUTPUT = $(OBJ_DIR)/$(TARGET_NAME)
all: $(TARGET_OUTPUT)
clean:
@rm -rf $(OBJ_DIR)/* $(OBJ_DIR)
install: $(TARGET_OUTPUT)
################################################################################
LDFLAGS += -Wall -shared -Wl,-soname,$(TARGET_NAME) -Wl,-z,defs -fPIC
ifeq ($(USE_VIP_DEVICE),1)
LDFLAGS += -pthread
LIBS += -lstdc++
INCLUDE += -I../include/vip
$(OBJ_DIR)/virtual_device.o: virtual_device.cpp
@echo " COMPILE $(abspath $<)"
@mkdir -p $(OBJ_DIR)
@$(CXX) -c -std=c++14 -pthread $(CFLAGS) -o $@ $<
endif
$(TARGET_OUTPUT): $(OBJECTS)
@echo " LINK \033[1m$(notdir $@)\033[0m"
@$(CC) $(LDFLAGS) $(OBJECTS) -o $(TARGET_OUTPUT) $(LIBS)
$(OBJ_DIR)/%.o: %.c
@echo " COMPILE $(abspath $<)"
@mkdir -p $(OBJ_DIR)
@$(CC) -c $(CFLAGS) -o $@ $<
else
##################################################################################
#SDK.
# include common definition.
include $(AQROOT)/makefile.linux.def
#################################################################################
INCLUDE += -I$(VIVANTE_SDK_INC) -I$(VIVANTE_SDK_INC)/HAL -I$(AQROOT)/sdk/inc
INCLUDE += -I../include/ops -I../include/utils -I../include/inference
INCLUDE += -I../include/client -I../include -I../include/libnnext
INCLUDE += -I../include/cpu_backend
INCLUDE += -I../src
CFLAGS += $(INCLUDE)
CFLAGS += -Wall -Wextra -Wno-unused-parameter -Wno-sign-compare -Werror
CFLAGS += -fvisibility=hidden -D'OVXLIB_API=__attribute__((visibility("default")))'
################################################################################
# Supply necessary libraries.
ifeq ($(USE_VXC_BINARY)$(USE_VSC_LITE),11)
LIBS += -L$(VIVANTE_SDK_LIB) -l OpenVX -l OpenVXU -l CLC -l VSC_Lite -lGAL
else
LIBS += -L$(VIVANTE_SDK_LIB) -l OpenVX -l OpenVXU -l CLC -l VSC -lGAL
endif
LIBS += -lm -ldl
#############################################################################
# Macros.
ifeq ($(gcdSTATIC_LINK), 1)
STATIC=1
TARGET_NAME = libovxlib.a
else
CFLAGS += -fPIC
DYNAMIC := 1
TARGET_NAME = libovxlib.so
endif
ifneq ("$(OVXLIB_CONFIG)", "")
CFLAGS += -D$(OVXLIB_CONFIG)
endif
ifneq ($(gcdSTATIC_LINK), 1)
ifeq ($(VSI_GPERF_DEBUG), 1)
TCMALLOC_DIR = $(OVXLIB_DIR)/third-party/gperftools
CFLAGS += -I$(TCMALLOC_DIR)/src
CFLAGS += -I$(TCMALLOC_DIR)/src/gperftools
CFLAGS += -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free
CFLAGS += -g
LIBS += -L$(TCMALLOC_DIR)/.libs -ltcmalloc
endif
endif
#############################################################################
# installation directory
INSTALL_DIR := $(VIVANTE_SDK_LIB)
################################################################################
# Include the common makefile.
ifeq ($(USE_VIP_DEVICE),1)
LDFLAGS += -pthread
LIBS += -lstdc++
INCLUDE += -I../include/vip
$(OBJ_DIR)/virtual_device.o: virtual_device.cpp
@echo " COMPILE $(abspath $<)"
@mkdir -p $(OBJ_DIR)
@$(CXX) -c -std=c++14 -pthread $(CFLAGS) -o $@ $<
endif
include $(AQROOT)/common.target
endif
@@ -234,6 +234,7 @@ static vsi_bool op_check
     IO_TYPE(D_F32, D_I16)
     IO_TYPE(D_F16, D_I32)
     IO_TYPE(D_I32, D_I32)
+    IO_TYPE(D_I32, D_I16)
     IO_TYPE(D_I8|Q_DFP, D_I32)
     IO_TYPE(D_U8|Q_ASYM, D_I32)
     IO_TYPE(D_I8|Q_ASYM, D_U8)
@@ -299,7 +299,7 @@ static vsi_bool op_setup
     }
     ret = vsi_nn_op_common_setup(self, inputs, outputs);
-    if ( _is_dataconvert_op(self, inputs, outputs) )
+    if ( _is_dataconvert_op(self, inputs, outputs) && ret )
     {
         vsi_nn_internal_node_t* curr = NULL;
         curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 1, 1);
@@ -128,6 +128,9 @@ static vsi_bool _is_tensorview_support
 #ifdef VSI_CONCAT_ENHANCE_SUPPORT
     // Driver support concat optimize in all dimensions.
     ret = TRUE;
+    VSI_UNREFERENCED(self);
+    VSI_UNREFERENCED(outputs);
 #else
     /*
     If the concat op need to be optimized to tensor view, the memory must be continues.
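Note: VSI_UNREFERENCED is needed here because the VSI_CONCAT_ENHANCE_SUPPORT branch never touches `self` or `outputs`, and the build runs with -Wextra -Werror. The macro's definition is outside this diff; the conventional shape of such a macro is:

    /* Conventional "unreferenced parameter" macro; the actual ovxlib
     * definition may differ in detail. */
    #define VSI_UNREFERENCED(param) ((void)(param))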
@@ -24,6 +24,7 @@
 #include <string.h>
 #include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
 #include "vsi_nn_platform.h"
 #include "vsi_nn_graph.h"
 #include "vsi_nn_node.h"
@@ -216,9 +217,12 @@ static vsi_bool op_setup
     if( VSI_NN_DIM_FMT_NHWC == inputs[1]->attr.dtype.fmt &&
         VSI_NN_TYPE_VDATA != inputs[1]->attr.dtype.vx_type )
     {
-        vsi_nn_TransposeTensor( self->graph, inputs[1], perm, 4, NULL );
+        if (!((vsi_nn_tensor_prv_t*)inputs[1])->processed)
+        {
+        vsi_nn_TransposeTensor(self->graph, inputs[1], perm, 4, NULL);
         inputs[1]->attr.dtype.fmt = VSI_NN_DIM_FMT_NCHW;
         }
+    }
 #ifdef VX_CONVERT_POLICY_WRAP_ENABLE
     if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 )
@@ -227,6 +231,8 @@ static vsi_bool op_setup
     }
 #endif
+    ((vsi_nn_tensor_prv_t*)inputs[1])->processed = TRUE;
     nn_param = &self->nn_param.conv2d;
     vsi_nn_compute_padding(
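Note: the new guard makes the in-place weight transpose idempotent: a constant NHWC weight tensor shared by several nodes, or revisited by a second setup pass, must be converted exactly once, and the private `processed` flag records that. The shape of the pattern, with hypothetical names (ovxlib keeps the real flag on vsi_nn_tensor_prv_t):

    /* One-shot conversion guard for shared constant tensors. */
    typedef struct { int processed; /* ... other private fields ... */ } tensor_prv_t;

    static void normalize_weight_once(tensor_prv_t* w)
    {
        if (!w->processed)
        {
            /* in-place layout change, e.g. NHWC -> NCHW transpose */
        }
        w->processed = 1;  /* later visitors see the tensor as already converted */
    }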
@@ -248,6 +248,7 @@ static vsi_bool op_check
     IO_TYPE(D_BOOL8, D_I16|Q_DFP)
     IO_TYPE(D_BOOL8, D_I16|Q_ASYM)
     IO_TYPE(D_BOOL8, D_I16|Q_SYM)
+    IO_TYPE(D_BOOL8, D_F16)
     IO_TYPE(D_BOOL8, D_I32)
     IO_TYPE(D_BOOL8, D_U16)
     IO_TYPE(D_BOOL8, D_U32)
@@ -258,6 +259,7 @@ static vsi_bool op_check
     IO_TYPE(D_I16|Q_DFP, D_BOOL8)
     IO_TYPE(D_I16|Q_ASYM, D_BOOL8)
     IO_TYPE(D_I16|Q_SYM, D_BOOL8)
+    IO_TYPE(D_F16, D_BOOL8)
     IO_TYPE(D_I32, D_BOOL8)
     IO_TYPE(D_U16, D_BOOL8)
     IO_TYPE(D_U32, D_BOOL8)
@@ -25,6 +25,7 @@
 #include <stdlib.h>
 #include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
 #include "vsi_nn_platform.h"
 #include "vsi_nn_graph.h"
 #include "vsi_nn_node.h"
@@ -410,9 +411,12 @@ static vsi_bool op_setup
     * */
     if( VSI_NN_DIM_FMT_NHWC == inputs[1]->attr.dtype.fmt )
     {
-        vsi_nn_TransposeTensor( self->graph, inputs[1], perm, 4, NULL );
+        if (!((vsi_nn_tensor_prv_t*)inputs[1])->processed)
+        {
+        vsi_nn_TransposeTensor(self->graph, inputs[1], perm, 4, NULL);
         inputs[1]->attr.dtype.fmt = VSI_NN_DIM_FMT_NCHW;
         }
+    }
 #ifdef VX_CONVERT_POLICY_WRAP_ENABLE
     if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 )
@@ -424,22 +428,30 @@ static vsi_bool op_setup
 #ifdef VX_DECONVOLUTION_WEIGHT_LAYOUT_COMPATIBLE_KHRONOS
     if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 && TRUE == inputs[1]->attr.is_const)
     {
+        if (!((vsi_nn_tensor_prv_t*)inputs[1])->processed) {
         /* whnc->whcn */
-        vsi_nn_PermuteTensor( self->graph, inputs[1], perm1, 4 );
+        vsi_nn_PermuteTensor(self->graph, inputs[1], perm1, 4);
+        }
     }
     /* Rotate 180 degrees for weights data */
     if (TRUE == inputs[1]->attr.is_const)
     {
+        if (!((vsi_nn_tensor_prv_t*)inputs[1])->processed) {
         vsi_nn_reshuffle_weight_data(self->graph, inputs[1]);
         }
+    }
 #else
     if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) >= 0 && TRUE == inputs[1]->attr.is_const)
     {
         /* whcn->whnc */
-        vsi_nn_PermuteTensor( self->graph, inputs[1], perm1, 4 );
+        if (!((vsi_nn_tensor_prv_t*)inputs[1])->processed) {
+        vsi_nn_PermuteTensor(self->graph, inputs[1], perm1, 4);
+        }
     }
 #endif
+    ((vsi_nn_tensor_prv_t*)inputs[1])->processed = TRUE;
     nn_param = &self->nn_param.deconv;
     nn_param->group = ( 0 == nn_param->group ) ? 1 : nn_param->group;
@@ -50,36 +50,12 @@ static vsi_status op_compute
     vsi_status status = VSI_FAILURE;
     vsi_nn_kernel_param_t * param = NULL;
     vsi_nn_kernel_node_t n = NULL;
-    uint32_t i = 0;
-    vsi_size_t block_size = 1, block_num = 1, axis_num = 0, indices_num = 1;
     int32_t axis = self->nn_param.gather.axis;
     int32_t batch_dims = self->nn_param.gather.batch_dims;
-    vsi_size_t *input_size = inputs[0]->attr.size;
-    uint32_t r_rank = vsi_nn_GetTensorIsScalar(inputs[0]) ? 0 : inputs[0]->attr.dim_num;
-    uint32_t q_rank = vsi_nn_GetTensorIsScalar(inputs[1]) ? 0 : inputs[1]->attr.dim_num;
     param = vsi_nn_kernel_param_create();
-    for (i = 0; i < (uint32_t)axis; ++i)
-    {
-        block_size *= input_size[i];
-    }
-    axis_num = input_size[axis];
-    for (i = axis + 1; i < r_rank - batch_dims; ++i)
-    {
-        block_num *= input_size[i];
-    }
-    for (i = 0; i < q_rank - batch_dims; ++i)
-    {
-        indices_num *= inputs[1]->attr.size[i];
-    }
-    vsi_nn_kernel_param_add_int32( param, "block_size", (int32_t)block_size );
-    vsi_nn_kernel_param_add_int32( param, "block_num", (int32_t)block_num );
-    vsi_nn_kernel_param_add_int32( param, "axis_num", (int32_t)axis_num );
     vsi_nn_kernel_param_add_int32( param, "axis", (int32_t)axis );
-    vsi_nn_kernel_param_add_int32( param, "indices_num", (int32_t)indices_num );
     vsi_nn_kernel_param_add_int32( param, "batch_dims", (int32_t)batch_dims );
     if (vsi_nn_is_same_data_type(inputs[0], outputs[0]) == FALSE ||
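Note: the deleted block shows how gather's kernel parameters derive from the shapes; that derivation now happens inside the kernel implementation, so op_compute forwards only `axis` and `batch_dims`. For reference, the partitioning the removed loops computed, restated compactly (`indices_size` stands in for `inputs[1]->attr.size`):

    /* block_size  = product of dims below axis (contiguous inner run)
     * axis_num    = extent of the gathered axis
     * block_num   = product of dims above axis, excluding batch dims
     * indices_num = number of indices, excluding batch dims */
    vsi_size_t block_size = 1, block_num = 1, indices_num = 1;
    for (uint32_t i = 0; i < (uint32_t)axis; ++i)             block_size *= input_size[i];
    vsi_size_t axis_num = input_size[axis];
    for (uint32_t i = axis + 1; i < r_rank - batch_dims; ++i) block_num *= input_size[i];
    for (uint32_t i = 0; i < q_rank - batch_dims; ++i)        indices_num *= indices_size[i];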
@@ -234,6 +234,10 @@ static vsi_bool op_setup_default
     {
         attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
     }
+    else if (inputs[GRUCELL_IN_H_STATE]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16)
+    {
+        attr.dtype.vx_type = VSI_NN_TYPE_BFLOAT16;
+    }
     else
     {
         attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16;
@@ -374,6 +374,17 @@ static vsi_bool op_setup
     }
     }
+    for ( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++)
+    {
+        if (inputs[LSTMUNIT_INPUT_WEIGHT_I2I + i] != NULL
+            && p->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_I2I + i].qnt_type == VSI_NN_QNT_TYPE_NONE
+            && p->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_I2I + i].vx_type == VSI_NN_TYPE_NONE
+            && inputs[LSTMUNIT_INPUT_WEIGHT_I2I + i]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16)
+        {
+            p->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_I2I + i] = inputs[LSTMUNIT_INPUT_WEIGHT_I2I + i]->attr.dtype;
+        }
+    }
     /* Input FC */
     if( is_input_fc_on_tp )
     {
@@ -54,21 +54,12 @@ static vsi_status op_compute
     vsi_status status = VSI_FAILURE;
     vsi_nn_kernel_param_t *param = NULL;
     vsi_nn_kernel_node_t n = NULL;
-    vsi_nn_tensor_t * tmp_inputs[2] = {NULL};
-    vsi_nn_tensor_t * tmp_outputs[1] = {NULL};
-    uint32_t new_rank[3] = {0};
-    vsi_bool ret = FALSE;
-    vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}};
     int32_t transposeA = self->nn_param.matrixmul.transpose[0];
     int32_t transposeB = self->nn_param.matrixmul.transpose[1];
     int32_t adjointA = self->nn_param.matrixmul.adjoint[0];
     int32_t adjointB = self->nn_param.matrixmul.adjoint[1];
-    uint32_t cross_flg = 0;
-    uint32_t size_axis_inner_outer[3] = {0};
-    uint32_t stride_axis_inner_outer[9] = {0};
     param = vsi_nn_kernel_param_create();
     vsi_nn_kernel_param_add_int32( param, "transposeA", transposeA );
@@ -76,52 +67,18 @@ static vsi_status op_compute
     vsi_nn_kernel_param_add_int32( param, "adjointA", adjointA );
     vsi_nn_kernel_param_add_int32( param, "adjointB", adjointB );
-    ret = vsi_nn_kernel_optimize_matrixmul_broadcast_shape(
-        inputs[0]->attr.size,
-        inputs[1]->attr.size,
-        outputs[0]->attr.size,
-        inputs[0]->attr.dim_num,
-        inputs[1]->attr.dim_num,
-        outputs[0]->attr.dim_num,
-        shapes[0], shapes[1], shapes[2], new_rank,
-        &cross_flg, size_axis_inner_outer, stride_axis_inner_outer);
-    if (ret)
-    {
-        vsi_nn_kernel_param_add_int32( param, "cross_flg", cross_flg );
-        vsi_nn_kernel_param_add_buffer( param, "size_axis_inner_outer", size_axis_inner_outer, 3);
-        vsi_nn_kernel_param_add_buffer( param, "stride_axis_inner_outer", stride_axis_inner_outer, 9);
-        tmp_inputs[0] = vsi_nn_reshape_tensor(self->graph, inputs[0], shapes[0], new_rank[0]);
-        tmp_inputs[1] = vsi_nn_reshape_tensor(self->graph, inputs[1], shapes[1], new_rank[1]);
-        tmp_outputs[0] = vsi_nn_reshape_tensor(self->graph, outputs[0], shapes[2], new_rank[2]);
-    }
-    else
-    {
-        VSILOGE("illegal inputs shape");
-        status = VSI_FAILURE;
-        goto final;
-    }
-    n = vsi_nn_kernel_selector( self->graph, "matrixmul", tmp_inputs, 2, tmp_outputs, 1, param );
+    n = vsi_nn_kernel_selector( self->graph, "matrixmul", inputs, 2, outputs, 1, param );
     if ( n != NULL )
     {
         self->n = (vx_node)n;
         status = VSI_SUCCESS;
     }
-final:
     if (param != NULL)
     {
         vsi_nn_kernel_param_release( &param );
     }
-    vsi_safe_release_tensor( tmp_inputs[0] );
-    vsi_safe_release_tensor( tmp_inputs[1] );
-    vsi_safe_release_tensor( tmp_outputs[0] );
     return status;
 } /* op_compute() */
@@ -74,6 +74,20 @@ static vsi_bool op_check
     return ret;
 } /* op_check() */
+static vsi_status op_init
+    (
+    vsi_nn_node_t * self
+    )
+{
+    vsi_status status = VSI_SUCCESS;
+    self->nn_param.max_pool3d.dilation[0] = 1;
+    self->nn_param.max_pool3d.dilation[1] = 1;
+    self->nn_param.max_pool3d.dilation[2] = 1;
+    return status;
+} /* op_init() */
 static vsi_status op_optimize
     (
     vsi_nn_node_t * self,
@@ -120,7 +134,7 @@ static vsi_bool op_setup
     inputs[0]->attr.size,
     ksize,
     p->stride,
-    NULL,
+    p->dilation,
     p->pad_type,
     pad
     );
@@ -142,7 +156,7 @@ static vsi_bool op_setup
     p->ksize[0],
     &p->pad[0],
     p->stride[0],
-    0,
+    p->dilation[0],
     p->round_type
     );
@@ -152,7 +166,7 @@ static vsi_bool op_setup
     p->ksize[1],
     &p->pad[2],
     p->stride[1],
-    0,
+    p->dilation[1],
     p->round_type
     );
@@ -162,7 +176,7 @@ static vsi_bool op_setup
     p->ksize[2],
     &p->pad[4],
     p->stride[2],
-    0,
+    p->dilation[2],
     p->round_type
     );
@@ -210,6 +224,8 @@ static vsi_bool op_setup
     curr->node->nn_param.pool.pad[1] = p->pad[1];
     curr->node->nn_param.pool.pad[2] = p->pad[2];
     curr->node->nn_param.pool.pad[3] = p->pad[3];
+    curr->node->nn_param.pool.dilation[0] = p->dilation[0];
+    curr->node->nn_param.pool.dilation[1] = p->dilation[1];
     curr->node->nn_param.pool.type = VX_CONVOLUTIONAL_NETWORK_POOLING_MAX;
     curr->node->nn_param.pool.round_type = p->round_type;
     curr->node->nn_param.pool.pad_type = p->pad_type;
@@ -265,6 +281,8 @@ static vsi_bool op_setup
     curr->node->nn_param.pool.pad[1] = 0;
     curr->node->nn_param.pool.pad[2] = p->pad[4];
     curr->node->nn_param.pool.pad[3] = p->pad[5];
+    curr->node->nn_param.pool.dilation[0] = 1;
+    curr->node->nn_param.pool.dilation[1] = p->dilation[2];
     curr->node->nn_param.pool.type = VX_CONVOLUTIONAL_NETWORK_POOLING_MAX;
     curr->node->nn_param.pool.round_type = p->round_type;
     curr->node->nn_param.pool.pad_type = p->pad_type;
@@ -305,7 +323,7 @@ __BEGIN_DECLS
 DEF_OP_REG
     (
     /* op_name */ MAX_POOL3D,
-    /* init */ NULL,
+    /* init */ op_init,
     /* compute */ op_compute,
     /* deinit */ op_deinit,
     /* check */ op_check,
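Note: the dilation plumbing follows the standard formula: a dilated window spans `dilation * (ksize - 1) + 1` input elements, and that effective extent replaces ksize in the output-size computation. A sketch of the per-dimension arithmetic, assuming FLOOR rounding (the actual helper also honors p->round_type):

    /* Output extent of one pooled dimension with dilation. */
    static uint32_t pool_out_size(uint32_t in, uint32_t ksize, uint32_t pad_head,
                                  uint32_t pad_tail, uint32_t stride, uint32_t dilation)
    {
        uint32_t effective_k = dilation * (ksize - 1) + 1;  /* span of the dilated window */
        return (in + pad_head + pad_tail - effective_k) / stride + 1;
    }

op_init defaulting all three dilations to 1 keeps existing graphs unchanged, since effective_k then reduces to ksize.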
Some files were not shown because too many files have changed in this diff