Update internal to 1.1.88 release (#657)

Internal ovxlib SHA 32fe479af5549e894bcd40de5740ae0dfd42bdb9
Type: Code Improvement
Signed-off-by: Feiyue Chen <Feiyue.Chen@verisilicon.com>

This commit is contained in:
parent 10081790ee
commit 1bb1e070f2
@@ -194,3 +194,4 @@ DEF_OP(INVERSE_SIGMOID)
 DEF_OP(GRID_SAMPLE)
 DEF_OP(LPNORM)
 DEF_OP(RESIZE_3D)
+DEF_OP(REDUCEL2)
@@ -35,7 +35,7 @@ typedef struct _vsi_nn_deconv_param
     uint32_t ksize[2];
     uint32_t stride[2];
     /* Pad left, right, top, bottom */
-    uint32_t pad[4];
+    int32_t pad[4];
     /* Pad type default value shall be AUTO */
     uint32_t pad_type;
     uint32_t weights;
@@ -44,6 +44,7 @@ typedef struct _vsi_nn_max_pool3d_param
     uint32_t pad[6];
     /* Pad type default value shall be AUTO */
     vsi_nn_pad_e pad_type;
+    uint32_t dilation[3];
 } vsi_nn_max_pool3d_param;
 _compiler_assert(offsetof(vsi_nn_max_pool3d_param, local) == 0, \
     vsi_nn_max_pool3d_h );
@@ -30,11 +30,20 @@
 extern "C" {
 #endif

+typedef struct _vsi_nn_moments_lcl_data
+{
+    vsi_bool use_internal_node;
+    uint32_t perm[VSI_NN_MAX_DIM_NUM];
+    int32_t axis[VSI_NN_MAX_DIM_NUM];
+} vsi_nn_moments_lcl_data;
+
 typedef struct _vsi_nn_moments_param
 {
     const int32_t* axis;
     int32_t axis_num;
     vsi_bool keep_dim;
+
+    vsi_nn_moments_lcl_data *lcl_data;
 } vsi_nn_moments_param;

 #ifdef __cplusplus
@@ -50,6 +50,7 @@ typedef struct _vsi_nn_pool_param
     vsi_nn_pad_e pad_type;
     /* poolwithargmax layer local data structure */
     vsi_nn_pool_lcl_data *local;
+    uint32_t dilation[2];
 } vsi_nn_pool_param;

 #ifdef __cplusplus
@@ -0,0 +1,47 @@
+/****************************************************************************
+*
+*    Copyright (c) 2020 Vivante Corporation
+*
+*    Permission is hereby granted, free of charge, to any person obtaining a
+*    copy of this software and associated documentation files (the "Software"),
+*    to deal in the Software without restriction, including without limitation
+*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+*    and/or sell copies of the Software, and to permit persons to whom the
+*    Software is furnished to do so, subject to the following conditions:
+*
+*    The above copyright notice and this permission notice shall be included in
+*    all copies or substantial portions of the Software.
+*
+*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+*    DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+#ifndef _VSI_NN_OP_REDUCEL2_H
+#define _VSI_NN_OP_REDUCEL2_H
+
+#include "vsi_nn_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct _vsi_nn_reducel2_param
+{
+    struct _reducel2_local_data_t * lcl;
+    vx_int32 *axis;
+    vx_uint32 axis_num;
+    vx_bool keep_dim;
+} vsi_nn_reducel2_param;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
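Note: the new REDUCEL2 op computes an L2-norm reduction, y = sqrt(sum(x*x)), over the axes listed in the param's axis array, keeping the reduced dimensions when keep_dim is set. A minimal sketch of that math over one axis of a contiguous [outer][n][inner] view (illustrative C; the function name is invented, this is not the ovxlib kernel):

    /* Sketch of the math behind REDUCEL2, not the ovxlib implementation:
     * y = sqrt(sum(x*x)) along the axis of length n. */
    #include <math.h>
    #include <stddef.h>

    static void reduce_l2_axis(const float *x, float *y,
                               size_t outer, size_t n, size_t inner)
    {
        for (size_t o = 0; o < outer; o++) {
            for (size_t in = 0; in < inner; in++) {
                float acc = 0.0f;
                for (size_t k = 0; k < n; k++) {
                    float v = x[(o * n + k) * inner + in];
                    acc += v * v;   /* sum of squares along the reduced axis */
                }
                y[o * inner + in] = sqrtf(acc);
            }
        }
    }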
@@ -466,77 +466,148 @@ static VSI_INLINE_API uint16_t fp32_to_bfp16_rtne

 static VSI_INLINE_API uint8_t fp32_to_fp8_e4m3(float in, const float scale) {
     float fp8_f32 = in / scale;
-    int32_t fp8_i32 = *((int32_t*)&fp8_f32);
-    //int32_t mask = (int32_t)(pow(2, 32) - 1 - (pow(2, 23 - 3) - 1));
-    int32_t eps = 1 << (23 - 3 - 1);
-    fp8_i32 += eps;
-    //fp8_i32 &= mask;
-    {
-    int sign = (fp8_i32 >> (FLOAT_EXPONENT_SIZE + FLOAT_MANTISSA_SIZE)) & 0x1;
-    int exp = (fp8_i32 >> FLOAT_MANTISSA_SIZE) & 0xff;
-    int expShiftValue = FLOAT8_E4M3_BIAS_EXPONENT - FLOAT_BIAS_EXPONENT;
-    int mantissa = (fp8_i32 >> (FLOAT_MANTISSA_SIZE - FLOAT8_E4M3_MANTISSA_SIZE)) & 0x7;
-
-    exp = (exp + expShiftValue) & 0xF;
-
-    return (uint8_t)(sign << 7 | exp << 3 | mantissa);
+    int32_t in_val = *((int32_t*)&fp8_f32);
+
+    uint32_t in_sign = (in_val >> (FLOAT_EXPONENT_SIZE + FLOAT_MANTISSA_SIZE)) & 0x1; /* bit 31 is sign */
+    uint32_t in_exp = (in_val >> FLOAT_MANTISSA_SIZE) & 0xFF; /* bit[30:24] is exp */
+    uint32_t in_man = (in_val & 0x7FFFFF); /* low 23 bits is man */
+
+    uint32_t out_sign = in_sign;
+    int32_t out_exp = (in_exp + FLOAT8_E4M3_BIAS_EXPONENT - FLOAT_BIAS_EXPONENT); /* in_exp - fp32 bias + E4M3 bias */
+    uint32_t man_rounding = 0, out_man = 0, out_val = 0;
+
+    man_rounding = (in_man + 0x80000) >> 20; /* man_rounding is 3 bits */
+    if (((man_rounding >> 3) && 0x1) == 1) {
+        /* when in_man like 0b11_1, exp += 1, mantissa is 0 */
+        out_exp += 1;
+    }
+
+    /* Clamp Denorm to zero */
+    if (out_exp <= 0) {
+        out_exp = 0;
+        man_rounding = 0;
+        out_sign = 0;
+    }
+
+    out_man = man_rounding & 0x7; /* keep low 3 bits of man */
+    /* overflow policy */
+    if (out_exp >= 16 || (out_exp == 15 && out_man == 7)) {
+        out_exp = 15;
+        out_man = 6;
+#if 0
+        if (mode == VX_CONVERT_POLICY_SATURATE) {
+            out_exp = 15;
+            out_man = 6;
+        } else if (mode == VX_CONVERT_POLICY_INF) {
+            out_exp = 15;
+            out_man = 7;
+        } else {
+            vxmASSERT(0 && "Error overflow mode!\n");
+        }
+#endif
+    }
+    out_val = (out_sign << 7) | (out_exp << 3) | out_man;
+    return (uint8_t)(out_val & 0xFF);
 } /* fp32_to_fp8_e4m3() */

 static VSI_INLINE_API uint8_t fp32_to_fp8_e5m2(float in, const float scale) {
     float fp8_f32 = in / scale;
-    int32_t fp8_i32 = *((int32_t*)&fp8_f32);
-    //int32_t mask = (int32_t)(pow(2, 32) - 1 - (pow(2, 23 - 2) - 1));
-    int32_t eps = 1 << (23 - 2 - 1);
-    fp8_i32 += eps;
-    //fp8_i32 &= mask;
-    {
-    int sign = (fp8_i32 >> (FLOAT_EXPONENT_SIZE + FLOAT_MANTISSA_SIZE)) & 0x1;
-    int exp = (fp8_i32 >> FLOAT_MANTISSA_SIZE) & 0xff;
-    int expShiftValue = FLOAT8_E5M2_BIAS_EXPONENT - FLOAT_BIAS_EXPONENT;
-    int mantissa = (fp8_i32 >> (FLOAT_MANTISSA_SIZE - FLOAT8_E5M2_MANTISSA_SIZE)) & 0x3;
+    int32_t in_val = *((int32_t*)&fp8_f32);
+    uint32_t in_sign = (in_val >> (FLOAT_EXPONENT_SIZE + FLOAT_MANTISSA_SIZE)) & 0x1; /* bit 31 is sign */
+    uint32_t in_exp = (in_val >> FLOAT_MANTISSA_SIZE) & 0xFF; /* bit[30:24] is exp */
+    uint32_t in_man = (in_val & 0x7FFFFF); /* low 23 bits is man */

-    exp = (exp + expShiftValue) & 0x1F;
+    uint32_t out_sign = in_sign;
+    int32_t out_exp = (in_exp + FLOAT8_E5M2_BIAS_EXPONENT - FLOAT_BIAS_EXPONENT); /* in_exp - fp32 bias + E5M2 bias */
+    uint32_t man_rounding = 0, out_man = 0, out_val = 0;

-    return (uint8_t)(sign << 7 | exp << 2 | mantissa);
+    man_rounding = (in_man + 0x100000) >> 21; /* man_rounding is 2 bits */
+    if (((man_rounding >> 2) && 0x1) == 1) {
+        /* when in_man like 0b11, exp += 1, mantissa is 0 */
+        out_exp += 1;
+    }
+
+    /* Clamp Denorm to zero */
+    if (out_exp <= 0) {
+        out_exp = 0;
+        man_rounding = 0;
+        out_sign = 0;
+    }
+
+    out_man = man_rounding & 0x3; /* keep low 2 bits of man */
+    /* overflow policy */
+    if (out_exp >= 31) {
+        out_exp = 30;
+        out_man = 3;
+#if 0
+        if (mode == VX_CONVERT_POLICY_SATURATE) {
+            out_exp = 30;
+            out_man = 3;
+        } else if (mode == VX_CONVERT_POLICY_INF) {
+            out_exp = 31;
+            out_man = 0;
+        } else {
+            vxmASSERT(0 && "Error overflow mode!\n");
+        }
+#endif
+    }
+    out_val = (out_sign << 7) | (out_exp << 2) | out_man;
+    return (uint8_t)(out_val & 0xFF);
 } /* fp32_to_fp8_e5m2() */

 static VSI_INLINE_API float fp8_e4m3_to_fp32(uint8_t in, const float scale) {
     float val_fp32;

     uint32_t signOut = 0;
     uint32_t exponentOut = 0;
     uint32_t mantissaOut = 0;
     uint32_t out_u = 0;

-    uint32_t signIn;
-    uint32_t exponentIn;
-    uint32_t mantissaIn;
-    int expShiftValue = FLOAT_BIAS_EXPONENT - FLOAT8_E4M3_BIAS_EXPONENT;
-
-    signIn = (in >> (FLOAT8_E4M3_EXPONENT_SIZE + FLOAT8_E4M3_MANTISSA_SIZE)) & 0x1;
-    exponentIn = (in >> FLOAT8_E4M3_MANTISSA_SIZE) & 0xF;
-    mantissaIn = in & 0x7;
-
-    signOut = signIn;
-
-    if (exponentIn == 0 && mantissaIn == 0)
-    {
-        goto final;
+    uint32_t signIn;
+    uint32_t exponentIn;
+    uint32_t mantissaIn;
+    uint32_t expShiftValue = FLOAT_BIAS_EXPONENT - FLOAT8_E4M3_BIAS_EXPONENT;
+    //uint32_t i = 0;
+    //uint32_t intMsk = 0x4;
+
+    signIn = (in >> (FLOAT8_E4M3_EXPONENT_SIZE + FLOAT8_E4M3_MANTISSA_SIZE)) & 0x1;
+    exponentIn = (in >> FLOAT8_E4M3_MANTISSA_SIZE) & 0xF;
+    mantissaIn = in & 0x7;
+
+    signOut = signIn;
+
+    /* clamp subnorm */
+    if (exponentIn == 0) {
+        goto final;
+    }
+    /*
+    if (exponentIn == 0 && mantissaIn == 0)
+    {
+        break;
+    }
+    else if (exponentIn == 0)
+    {
+        while (!(mantissaIn & intMsk))
+        {
+            intMsk >>= 1;
+            ++i;
+        }
+        exponentOut = (exponentIn + expShiftValue - i) & 0xff;
+        mantissaIn = ((mantissaIn ^ intMsk) << (i + 1));
+        mantissaOut = (mantissaIn << (FLOAT_MATISSA_SIZE - FLOAT8_E4M3_MANTISSA_SIZE)) & 0x7fffff;
+        break;
+    }
+    */
+
+    if (exponentIn == 0xf && mantissaIn == 0x7) {
+        exponentOut = 0xff;
+        mantissaOut = 0x400000;
+        goto final;
+    }
+
+    exponentOut = (exponentIn + expShiftValue) & 0xff;
+    mantissaOut = (mantissaIn << (FLOAT_MANTISSA_SIZE - FLOAT8_E4M3_MANTISSA_SIZE)) & 0x7fffff;
-    }
-
-    if (exponentIn == 0xf && mantissaIn == 0x7)
-    {
-        exponentOut = 0xff;
-        mantissaOut = 0x400000;
-        goto final;
-    }
-
-    exponentOut = (exponentIn + expShiftValue) & 0xff;
-    mantissaOut = (mantissaIn << (FLOAT_MANTISSA_SIZE - FLOAT8_E4M3_MANTISSA_SIZE)) & 0x7fffff;

 final:
     out_u = signOut << 31 | exponentOut << 23 | mantissaOut;
     val_fp32 = *((float*)&out_u);
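A worked example of the new e4m3 rounding path, as a standalone sketch. It assumes the usual constant values (FLOAT_BIAS_EXPONENT 127, FLOAT8_E4M3_BIAS_EXPONENT 7, FLOAT_MANTISSA_SIZE 23), which are not shown in this hunk; it mirrors the diff's logic, it is not the ovxlib function. Incidentally, the diff's (man_rounding >> 3) && 0x1 uses logical AND, which behaves like & 0x1 here only because the shifted value is 0 or 1.

    #include <stdint.h>
    #include <string.h>

    static uint8_t fp32_to_e4m3_sketch(float f)
    {
        uint32_t bits;
        memcpy(&bits, &f, sizeof bits);              /* avoids strict-aliasing UB */
        uint32_t sign = bits >> 31;
        int32_t  exp  = (int32_t)((bits >> 23) & 0xFF) + 7 - 127;
        uint32_t man  = (bits & 0x7FFFFF) + 0x80000; /* round: add half of 2^20 */
        uint32_t man3 = man >> 20;                   /* 3 mantissa bits plus carry */
        if (man3 >> 3) { exp += 1; man3 = 0; }       /* mantissa overflowed into exp */
        if (exp <= 0) { return 0; }                  /* denorms clamp to zero */
        if (exp >= 16 || (exp == 15 && man3 == 7)) { exp = 15; man3 = 6; } /* saturate at 448 */
        return (uint8_t)((sign << 7) | ((uint32_t)exp << 3) | man3);
    }
    /* Example: 1.5f has exponent field 127 and mantissa 0x400000, so
     * out_exp = 127 + 7 - 127 = 7 and man_rounding = (0x400000 + 0x80000) >> 20 = 4;
     * the byte is 0<<7 | 7<<3 | 4 = 0x3C, i.e. 1.100b * 2^(7-7) = 1.5. */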
@@ -546,44 +617,60 @@ final:

 static VSI_INLINE_API float fp8_e5m2_to_fp32(int8_t in, const float scale) {
     float val_fp32;

     uint32_t signOut = 0;
     uint32_t exponentOut = 0;
     uint32_t mantissaOut = 0;
     uint32_t out_u = 0;

-    uint32_t signIn;
-    uint32_t exponentIn;
-    uint32_t mantissaIn;
-    int expShiftValue = FLOAT_BIAS_EXPONENT - FLOAT8_E5M2_BIAS_EXPONENT;
-
-    signIn = (in >> 7) & 0x1;
-    exponentIn = (in >> 2) & 0x1F;
-    mantissaIn = in & 0x3;
-
-    signOut = signIn;
-
-    if (exponentIn == 0 && mantissaIn == 0)
-    {
-        goto final;
+    uint32_t signIn;
+    uint32_t exponentIn;
+    uint32_t mantissaIn;
+    uint32_t expShiftValue = FLOAT_BIAS_EXPONENT - FLOAT8_E5M2_BIAS_EXPONENT;
+    //uint32_t i = 0;
+    //uint32_t intMsk = 0x2;
+
+    signIn = (in >> (FLOAT8_E5M2_EXPONENT_SIZE + FLOAT8_E5M2_MANTISSA_SIZE)) & 0x1;
+    exponentIn = (in >> FLOAT8_E5M2_MANTISSA_SIZE) & 0x1F;
+    mantissaIn = in & 0x3;
+
+    signOut = signIn;
+
+    /* clamp subnorm */
+    if (exponentIn == 0) {
+        goto final;
+    }
+    /*
+    if (exponentIn == 0 && mantissaIn == 0)
+    {
+        break;
+    }
+    else if (exponentIn == 0)
+    {
+        while (!(mantissaIn & intMsk))
+        {
+            intMsk >>= 1;
+            ++i;
+        }
+        exponentOut = (exponentIn + expShiftValue - i) & 0xff;
+        mantissaIn = ((mantissaIn ^ intMsk) << (i + 1));
+        mantissaOut = (mantissaIn << (FLOAT_MATISSA_SIZE - FLOAT8_E5M2_MANTISSA_SIZE)) & 0x7fffff;
+        break;
+    }
+    */
+
+    if (exponentIn == 0x1f && mantissaIn == 0x3) {
+        exponentOut = 0xff;
+        mantissaOut = 0x400000;
+        goto final;
+    }
+
+    exponentOut = (exponentIn + expShiftValue) & 0xff;
+    mantissaOut = (mantissaIn << (FLOAT_MANTISSA_SIZE - FLOAT8_E5M2_MANTISSA_SIZE)) & 0x7fffff;
-    }
-
-    if (exponentIn == 0x1f && mantissaIn == 0x3)
-    {
-        exponentOut = 0xff;
-        mantissaOut = 0x400000;
-        goto final;
-    }
-
-    exponentOut = (exponentIn + expShiftValue) & 0xff;
-    mantissaOut = (mantissaIn << (FLOAT_MANTISSA_SIZE - FLOAT8_E5M2_MANTISSA_SIZE)) & 0x7fffff;

-final:
+final:
     out_u = signOut << 31 | exponentOut << 23 | mantissaOut;
     val_fp32 = *((float*)&out_u);

     return val_fp32 * scale;
 } /* fp8_e5m2_to_fp32() */
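The decode direction is the mirror image: re-bias the exponent and left-align the few mantissa bits into fp32's 23-bit field, clamping subnormals to signed zero and mapping the all-ones pattern to NaN. A compact sketch for e5m2, assuming bias 15 (illustrative, not the ovxlib function):

    #include <stdint.h>
    #include <string.h>

    static float e5m2_to_fp32_sketch(uint8_t in, float scale)
    {
        uint32_t sign = (in >> 7) & 0x1;
        uint32_t exp  = (in >> 2) & 0x1F;
        uint32_t man  = in & 0x3;
        uint32_t out;
        if (exp == 0) {
            out = sign << 31;                      /* subnorms clamp to +/-0 */
        } else if (exp == 0x1F && man == 0x3) {
            out = (sign << 31) | 0x7FC00000u;      /* NaN */
        } else {
            out = (sign << 31) | ((exp + 127 - 15) << 23) | (man << 21);
        }
        float f;
        memcpy(&f, &out, sizeof f);
        return f * scale;
    }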
@@ -241,7 +241,7 @@ OVXLIB_API vsi_status vsi_nn_VerifyGraph
  */
 OVXLIB_API vsi_status vsi_nn_RunGraph
     (
-    const vsi_nn_graph_t * graph
+    vsi_nn_graph_t * graph
     );

 /**
@@ -273,7 +273,7 @@ OVXLIB_API vsi_status vsi_nn_AsyncRunGraph

 OVXLIB_API vsi_status vsi_nn_AsyncRunWait
     (
-    vsi_nn_graph_t * graph
+    vsi_nn_graph_t * graph
     );

 /**
@@ -556,7 +556,7 @@ OVXLIB_API vsi_bool vsi_nn_SetGraphOutputs
  * @param[in] graph Graph handle
  * @param[in] id Node id to be removed.
  */
-void vsi_nn_RemoveNode
+OVXLIB_API void vsi_nn_RemoveNode
     (
     vsi_nn_graph_t * graph,
     vsi_nn_node_id_t id
@@ -788,6 +788,14 @@ OVXLIB_API vsi_status vsi_nn_ExecuteGraphLoop
     vsi_nn_graph_t* graph,
     vsi_nn_tensor_t *max_iteration_tensor
     );
+
+OVXLIB_API vsi_status vsi_nn_SetGraphTransformOption
+    (
+    vsi_nn_graph_t* graph,
+    const char* ctrl_str,
+    size_t size
+    );
+
 #ifdef __cplusplus
 }
 #endif
@@ -39,7 +39,7 @@ vx_tensor vsi_nn_CreateRawTensorFromData
     vsi_nn_tensor_attr_t * attr
     );

-vsi_status vsi_nn_OptimizeGraph
+OVXLIB_API vsi_status vsi_nn_OptimizeGraph
     (
     vsi_nn_graph_t* graph,
     vsi_bool *dirty
@@ -208,6 +208,7 @@
 #include "ops/vsi_nn_op_grid_sample.h"
 #include "ops/vsi_nn_op_lpnorm.h"
 #include "ops/vsi_nn_op_resize_3d.h"
+#include "ops/vsi_nn_op_reducel2.h"
 /* custom node head define define */
 #include "custom/vsi_nn_custom_node_type.h"
 #include "ops/vsi_nn_op_inverse_sigmoid.h"
@@ -404,6 +405,7 @@ typedef union _vsi_nn_nn_param
     vsi_nn_grid_sample_param gridsample;
     vsi_nn_lpnorm_param lpnorm;
     vsi_nn_resize_3d_param resize_3d;
+    vsi_nn_reducel2_param reducel2;
     void* client_param;

     /* custom node data struct define */
@@ -268,7 +268,7 @@ vsi_status vsi_nn_OpOptimize
  *
  * @return VSI_SUCCESS on success, or error code otherwise.
  */
-vsi_bool vsi_nn_OpCheck
+OVXLIB_API vsi_bool vsi_nn_OpCheck
     (
     vsi_nn_op_t op,
     vsi_nn_node_t * node,
@@ -264,6 +264,14 @@ OVXLIB_API vsi_status vsi_nn_AddBinaryGraphInputsWithCropParam
     uint32_t enable_nodes_count
     );

+OVXLIB_API vsi_status vsi_nn_AddBinaryGraphInputsWithCropParamForCropOnly
+    (
+    vsi_nn_graph_t* graph,
+    vsi_nn_node_id_t* enable_nodes,
+    vsi_bool* crop_set_start_only,
+    uint32_t enable_nodes_count
+    );
+
 OVXLIB_API vsi_status vsi_nn_UpdateCropParamsForBinaryGraph
     (
     vsi_nn_graph_t* graph,
@@ -614,6 +614,13 @@ OVXLIB_API vsi_status vsi_nn_SwapTensorHandle
     vsi_nn_tensor_t * tensor1
     );

+OVXLIB_API vsi_status vsi_nn_SwapTensorHandleWithCache
+    (
+    vsi_nn_graph_t * graph,
+    vsi_nn_tensor_t * tensor0,
+    vsi_nn_tensor_t * tensor1
+    );
+
 OVXLIB_API vsi_size_t vsi_nn_vxGetTensorElementNum
     (
     vsi_nn_tensor_attr_t *attr
@@ -33,7 +33,7 @@ extern "C"{

 #define VSI_NN_VERSION_MAJOR 1
 #define VSI_NN_VERSION_MINOR 1
-#define VSI_NN_VERSION_PATCH 84
+#define VSI_NN_VERSION_PATCH 88
 #define VSI_NN_VERSION \
     (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
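With those macros, this release encodes as 1*10000 + 1*100 + 88 = 10188. A compile-time guard against older ovxlib builds could look like the following (the header path is assumed; adjust it to the actual include layout):

    #include "vsi_nn_version.h"

    #if VSI_NN_VERSION < 10188   /* 1.1.88 == 1*10000 + 1*100 + 88 */
    #error "ovxlib 1.1.88 or newer is required"
    #endif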
@@ -42,7 +42,7 @@ __BEGIN_DECLS
 /*
  * Define kernel meta.
  */
-#define _INPUT_NUM (1)
+#define _INPUT_NUM (2)
 #define _OUTPUT_NUM (1)
 #define _CPU_IO_NUM (_INPUT_NUM + _OUTPUT_NUM)
 #define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.custom_warp_affine")
@@ -54,6 +54,7 @@ __BEGIN_DECLS
 static vx_param_description_t _custom_warp_affine_kernel_param_def[] =
 {
     {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL},
     {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
     {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
     {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
@@ -66,8 +67,9 @@ static vx_param_description_t _custom_warp_affine_kernel_param_def[] =
     // Add kererl parameters here
 };
 #define _CUSTOM_WARP_AFFINE_PARAM_NUM _cnt_of_array( _custom_warp_affine_kernel_param_def )
-#define SCALAR_INPUT_TYPE (2)
-#define SCALAR_MATRIX_OFFSET (3)
+#define SCALAR_INPUT_TYPE (3)
+#define SCALAR_MATRIX_OFFSET (4)
+#define SCALAR_INPUT_RGB_TYPE (10)

 static void _transform_affine
     (
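The index shifts above follow from the new optional matrix tensor taking parameter slot 1, which pushes the output tensor and every scalar up by one. Slot numbers below are inferred from the defines and the param_def array, for illustration only:

    /* Inferred slot map:
     *   old: 0 input, 1 output, 2 SCALAR_INPUT_TYPE, 3..8 matrix scalars, 9 rgb_type
     *   new: 0 input, 1 optional matrix tensor, 2 output,
     *        3 SCALAR_INPUT_TYPE, 4..9 matrix scalars, 10 SCALAR_INPUT_RGB_TYPE */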
@@ -142,44 +144,60 @@ DEF_KERNEL_EXECUTOR(_compute)

     tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
     tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
+    tensors[2] = (vsi_nn_kernel_tensor_t)param[2];

     attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
     CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
-    attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
-    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
+    attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] );
+    CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );

-    out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] );
+    out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] );

-    /* alloc the float32 data buffer */
-    buffer[1] = (float *)malloc(out_elements * sizeof(float));
-    CHECK_PTR_FAIL_GOTO( buffer[1], "Create input buffer fail.", final );
-    memset(buffer[1], 0, out_elements * sizeof(float));
-
     buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
     CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final );

+    if (tensors[1])
+    {
+        attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
+        CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
+        buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE );
+        CHECK_PTR_FAIL_GOTO( buffer[1], "Create input buffer fail.", final );
+    }
+
+    buffer[2] = (float *)malloc(out_elements * sizeof(float));
+    CHECK_PTR_FAIL_GOTO( buffer[2], "Create input buffer fail.", final );
+    memset(buffer[2], 0, out_elements * sizeof(float));
+
     status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_TYPE],
         &type);
-    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &rgb_type);
+    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_RGB_TYPE], &rgb_type);
     CHECK_STATUS_FAIL_GOTO(status, final );
     for (i = 0; i < 6; i++)
     {
-        status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MATRIX_OFFSET + i],
-            &matrix[i]);
-        CHECK_STATUS_FAIL_GOTO(status, final );
+        if (buffer[1] == NULL)
+        {
+            status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MATRIX_OFFSET + i],
+                &matrix[i]);
+            CHECK_STATUS_FAIL_GOTO(status, final );
+        }
+        else
+        {
+            matrix[i] = buffer[1][i];
+        }
     }

-    width = attr[1]->shape->data[0];
-    height = attr[1]->shape->data[1];
-    for(i = 2; i < (vsi_size_t)attr[1]->shape->size; ++i)
+    width = attr[2]->shape->data[0];
+    height = attr[2]->shape->data[1];
+    for(i = 2; i < (vsi_size_t)attr[2]->shape->size; ++i)
     {
-        outer_size *= attr[1]->shape->data[i];
+        outer_size *= attr[2]->shape->data[i];
     }
     // Do something
     for (b = 0; b < outer_size; b++)
     {
         float *src_base = buffer[0] + b * attr[0]->shape->data[0] * attr[0]->shape->data[1];
-        float *dst_base = buffer[1] + b * width * height;
+        float *dst_base = buffer[2] + b * width * height;

         if ( rgb_type == VSI_NN_WARP_AFFINE_TYPE_RGB )
         {
@@ -274,8 +292,8 @@ DEF_KERNEL_EXECUTOR(_compute)
         }
     }

-    status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1],
-        buffer[1], out_elements );
+    status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2],
+        buffer[2], out_elements );
     CHECK_STATUS_FAIL_GOTO( status, final );
 final:
     for( i = 0; i < _CPU_IO_NUM; i ++ )
@@ -350,7 +368,7 @@ static vsi_nn_kernel_node_t _setup
         node_params[SCALAR_MATRIX_OFFSET + i] = vsi_nn_kernel_scalar_create(
             graph, F32, &buffer[i] );
     }
-    node_params[9] = vsi_nn_kernel_scalar_create(
+    node_params[SCALAR_INPUT_RGB_TYPE] = vsi_nn_kernel_scalar_create(
         graph, I32, &rgb_type );

     /* Pass parameters to node. */
@@ -360,7 +378,7 @@ static vsi_nn_kernel_node_t _setup
         {
             vsi_nn_kernel_scalar_release( &node_params[SCALAR_MATRIX_OFFSET + i] );
         }
-        vsi_nn_kernel_scalar_release( &node_params[9] );
+        vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_RGB_TYPE] );
     }
 }
 return node;
@@ -49,29 +49,52 @@ typedef enum _custom_warp_affine_type_e
     bilinear = VSI_NN_INTERPOLATION_BILINEAR,
 }custom_warp_affine_type_e;

+#define _CUSTOM_WARP_AFFINE_2D_KERNEL_SOURCE "custom_warp_affine_2d"
 #define _CUSTOM_WARP_AFFINE_KERNEL_SOURCE "custom_warp_affine"
 #define _CUSTOM_WARP_AFFINE_RGB_KERNEL_SOURCE "custom_warp_affine_rgb"
+#define _CUSTOM_WARP_AFFINE_OPTIONAL_KERNEL_SOURCE "custom_warp_affine_optional"
+#define _CUSTOM_WARP_AFFINE_RGB_OPTIONAL_KERNEL_SOURCE "custom_warp_affine_rgb_optional"

 // Add kernel hashtable here
-#define CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, IMG_2D, RGB_TYPE ) \
-    (( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (TYPE << 16) | (IMG_2D << 20) | (RGB_TYPE << 24))
+#define CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, IMG_2D, RGB_TYPE, OPTIONAL_INTPUT ) \
+    (( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (TYPE << 16) | (IMG_2D << 20) | \
+    (RGB_TYPE << 24) | (OPTIONAL_INTPUT << 28))
 #define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \
-    { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0, 0 ), \
+    { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0, 0, 0 ), \
     CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE), \
     _CUSTOM_WARP_AFFINE_KERNEL_SOURCE }
 #define PACK_2D_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \
-    { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1, 0 ), \
+    { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1, 0, 0 ), \
     CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_2D"), \
-    _CUSTOM_WARP_AFFINE_KERNEL_SOURCE }
+    _CUSTOM_WARP_AFFINE_2D_KERNEL_SOURCE }
+
+#define PACK_OPTIONAL_INPUT_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \
+    { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0, 0, 1 ), \
+    CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_optional_input"), \
+    _CUSTOM_WARP_AFFINE_OPTIONAL_KERNEL_SOURCE }
+#define PACK_OPTIONAL_INPUT_2D_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \
+    { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1, 0, 1 ), \
+    CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_2D_optional_input"), \
+    _CUSTOM_WARP_AFFINE_OPTIONAL_KERNEL_SOURCE }

 #define PACK_RGB_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \
-    { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0, 1 ), \
+    { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0, 1, 0 ), \
     CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_rgb"), \
     _CUSTOM_WARP_AFFINE_RGB_KERNEL_SOURCE }
 #define PACK_RGB_2D_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \
-    { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1, 1 ), \
+    { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1, 1, 0 ), \
     CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_rgb_2D"), \
     _CUSTOM_WARP_AFFINE_RGB_KERNEL_SOURCE }

+#define PACK_OPTIONAL_INPUT_RGB_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \
+    { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0, 1, 1 ), \
+    CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_rgb_optional_input"), \
+    _CUSTOM_WARP_AFFINE_RGB_OPTIONAL_KERNEL_SOURCE }
+#define PACK_RGB_2D_OPTIONAL_INPUT_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \
+    { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1, 1, 1 ), \
+    CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_rgb_2D_optional_input"), \
+    _CUSTOM_WARP_AFFINE_RGB_OPTIONAL_KERNEL_SOURCE }
+
 typedef struct
 {
     uint32_t key;
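The hash key packs each kernel variant into disjoint bit fields (dtypes in bits 0-15, type in 16-19, 2D flag at 20, RGB flag at 24, and now the optional-input flag at 28; "OPTIONAL_INTPUT" is the macro's own spelling). A tiny self-check of that packing, illustrative only:

    #include <stdint.h>
    #include <assert.h>

    static uint32_t pack_key(uint32_t in_dt, uint32_t out_dt, uint32_t type,
                             uint32_t img_2d, uint32_t rgb, uint32_t optional)
    {
        return in_dt | (out_dt << 8) | (type << 16) | (img_2d << 20) |
               (rgb << 24) | (optional << 28);
    }

    int main(void)
    {
        /* Two variants differing only in the optional-input flag must not collide. */
        assert(pack_key(1, 1, 0, 1, 0, 0) != pack_key(1, 1, 0, 1, 0, 1));
        return 0;
    }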
@@ -84,15 +107,23 @@ static const _kernel_map_type _custom_warp_affine_kernel_map[] =
     // Register kernel here
     PACK_KERNEL_MAP( U8, U8, nearest_neighbor ),
     PACK_KERNEL_MAP( U8, U8, bilinear ),
+    PACK_OPTIONAL_INPUT_KERNEL_MAP( U8, U8, nearest_neighbor ),
+    PACK_OPTIONAL_INPUT_KERNEL_MAP( U8, U8, bilinear ),

     PACK_2D_KERNEL_MAP( U8, U8, nearest_neighbor ),
     PACK_2D_KERNEL_MAP( U8, U8, bilinear ),
+    PACK_OPTIONAL_INPUT_2D_KERNEL_MAP( U8, U8, nearest_neighbor ),
+    PACK_OPTIONAL_INPUT_2D_KERNEL_MAP( U8, U8, bilinear ),

     PACK_RGB_KERNEL_MAP( U8, U8, nearest_neighbor ),
     PACK_RGB_KERNEL_MAP( U8, U8, bilinear ),
+    PACK_OPTIONAL_INPUT_RGB_KERNEL_MAP( U8, U8, nearest_neighbor ),
+    PACK_OPTIONAL_INPUT_RGB_KERNEL_MAP( U8, U8, bilinear ),

     PACK_RGB_2D_KERNEL_MAP( U8, U8, nearest_neighbor ),
     PACK_RGB_2D_KERNEL_MAP( U8, U8, bilinear ),
+    PACK_RGB_2D_OPTIONAL_INPUT_KERNEL_MAP( U8, U8, nearest_neighbor ),
+    PACK_RGB_2D_OPTIONAL_INPUT_KERNEL_MAP( U8, U8, bilinear ),
 };

 /*
@@ -110,8 +141,21 @@ static vx_param_description_t _custom_warp_affine_kernel_param_def[] =
     {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
     // Add kererl parameters here
 };
-#define _CUSTOM_WARP_AFFINE_PARAM_NUM _cnt_of_array( _custom_warp_affine_kernel_param_def )
 #define SCALAR_MATRIX_OFFSET (2)
+
+static vx_param_description_t _custom_warp_affine_optinal_kernel_param_def[] =
+{
+    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    // Add kererl parameters here
+};
+#define _CUSTOM_WARP_AFFINE_PARAM_NUM _cnt_of_array( _custom_warp_affine_optinal_kernel_param_def )
 /*
  * Kernel initializer
  */
@@ -138,17 +182,21 @@ DEF_KERNEL_INITIALIZER(_custom_warp_affine_initializer)
     float matrix1[4] = {0};
     float matrix4[4] = {0};
     int32_t i = 0;
-
-    VSI_UNREFERENCED(param_size);
+    uint32_t scalar_matrix_offset = 3;

     attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
     CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
-    attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
+    attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[param_size - 7] );
     CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );

+    if (param_size == 8)
+    {
+        scalar_matrix_offset = 2;
+    }
+
     for (i = 0; i < 6; i++)
     {
-        status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MATRIX_OFFSET + i],
+        status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[scalar_matrix_offset + i],
             &m[i]);
         CHECK_STATUS_FAIL_GOTO(status, final );
     }
@@ -170,13 +218,16 @@ DEF_KERNEL_INITIALIZER(_custom_warp_affine_initializer)
         / gpu_param.global_scale[1]);
     gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;

-    status = vsi_nn_kernel_gpu_add_param( node,
-        "matrix0", &matrix0 );
-    status |= vsi_nn_kernel_gpu_add_param( node,
-        "matrix1", &matrix1 );
-    status |= vsi_nn_kernel_gpu_add_param( node,
-        "matrix4", &matrix4 );
-    CHECK_STATUS_FAIL_GOTO(status, final );
+    if (param_size == 8)
+    {
+        status = vsi_nn_kernel_gpu_add_param( node,
+            "matrix0", &matrix0 );
+        status |= vsi_nn_kernel_gpu_add_param( node,
+            "matrix1", &matrix1 );
+        status |= vsi_nn_kernel_gpu_add_param( node,
+            "matrix4", &matrix4 );
+        CHECK_STATUS_FAIL_GOTO(status, final );
+    }

     status = vsi_nn_kernel_gpu_config( node, &gpu_param );
@@ -217,17 +268,21 @@ DEF_KERNEL_INITIALIZER(_custom_warp_affine_rgb_initializer)
     float matrix0[4] = {0};
     float matrix1[4] = {0};
     int32_t i = 0;
-
-    VSI_UNREFERENCED(param_size);
+    uint32_t scalar_matrix_offset = 3;

     attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
     CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
-    attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
+    attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[param_size - 7] );
     CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );

+    if (param_size == 8)
+    {
+        scalar_matrix_offset = 2;
+    }
+
     for (i = 0; i < 6; i++)
     {
-        status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MATRIX_OFFSET + i],
+        status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[scalar_matrix_offset + i],
            &m[i]);
         CHECK_STATUS_FAIL_GOTO(status, final );
     }
@@ -248,11 +303,14 @@ DEF_KERNEL_INITIALIZER(_custom_warp_affine_rgb_initializer)
         / gpu_param.global_scale[1]);
     gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;

-    status = vsi_nn_kernel_gpu_add_param( node,
-        "matrix0", &matrix0 );
-    status |= vsi_nn_kernel_gpu_add_param( node,
-        "matrix1", &matrix1 );
-    CHECK_STATUS_FAIL_GOTO(status, final );
+    if (param_size == 8)
+    {
+        status = vsi_nn_kernel_gpu_add_param( node,
+            "matrix0", &matrix0 );
+        status |= vsi_nn_kernel_gpu_add_param( node,
+            "matrix1", &matrix1 );
+        CHECK_STATUS_FAIL_GOTO(status, final );
+    }

     status = vsi_nn_kernel_gpu_config( node, &gpu_param );
@@ -280,7 +338,8 @@ static vsi_status _query_kernel
     vsi_nn_tensor_t * const * const inputs,
     vsi_nn_tensor_t * const * const outputs,
     int32_t type,
-    int32_t rgb_type
+    int32_t rgb_type,
+    int32_t optional_input
     )
 {
     vsi_status status = VSI_FAILURE;
@@ -289,6 +348,7 @@ static vsi_status _query_kernel
     const _kernel_map_type * kernel_map = _custom_warp_affine_kernel_map;
     size_t kernel_map_size = _cnt_of_array( _custom_warp_affine_kernel_map );
     vx_param_description_t * param_def = _custom_warp_affine_kernel_param_def;
+    size_t param_def_size = _cnt_of_array( _custom_warp_affine_kernel_param_def );
     vx_kernel_initialize_f initializer = _custom_warp_affine_initializer;
     int32_t is_2d_img = inputs[0]->attr.dim_num < 3 || inputs[0]->attr.size[2] == 1;
     uint32_t key = 0;
@@ -297,7 +357,12 @@ static vsi_status _query_kernel
     in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
     out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

-    key = CUSTOM_WARP_AFFINE_HASH_KEY( in_dtype, out_dtype, type, is_2d_img, rgb_type );
+    if (optional_input == 1)
+    {
+        param_def = _custom_warp_affine_optinal_kernel_param_def;
+        param_def_size = _cnt_of_array(_custom_warp_affine_optinal_kernel_param_def);
+    }
+    key = CUSTOM_WARP_AFFINE_HASH_KEY( in_dtype, out_dtype, type, is_2d_img, rgb_type, optional_input );
     if (rgb_type == 1)
     {
         initializer = _custom_warp_affine_rgb_initializer;
@@ -313,7 +378,7 @@ static vsi_status _query_kernel
     {
         snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
         kernel->info.parameters = param_def;
-        kernel->info.numParams = _cnt_of_array( _custom_warp_affine_kernel_param_def );
+        kernel->info.numParams = (vx_uint32)param_def_size;
         kernel->info.initialize = initializer;
         // Register code source
         vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
@@ -348,13 +413,23 @@ static vsi_nn_kernel_node_t _setup
     int32_t type = vsi_nn_kernel_param_get_int32( params, "type");
     int32_t rgb_type = vsi_nn_kernel_param_get_int32( params, "rgb_type");
     float * buffer = (float*)vsi_nn_kernel_param_get_const_buffer( params, "matrix", &buffer_size );
+    int32_t optional_input = 1;
+    uint32_t scalar_matrix_offset = 3;
+    uint32_t param_num = _CUSTOM_WARP_AFFINE_PARAM_NUM;
+    if (inputs[1] == NULL)
+    {
+        optional_input = 0;
+        input_num = 1;
+        scalar_matrix_offset = scalar_matrix_offset - 1;
+        param_num = param_num - 1;
+    }

     if (vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE)
     {
         return NULL;
     }

-    status = _query_kernel( kernel, inputs, outputs, type, rgb_type );
+    status = _query_kernel( kernel, inputs, outputs, type, rgb_type, optional_input );
     if ( VSI_SUCCESS == status)
     {
         node = vsi_nn_kernel_create_node( graph, kernel );
@@ -364,19 +439,20 @@ static vsi_nn_kernel_node_t _setup
         border.mode = VX_BORDER_CONSTANT;

         /* Set inputs and outputs */
-        vsi_nn_kernel_node_pack_io( node_params, _CUSTOM_WARP_AFFINE_PARAM_NUM,
+        vsi_nn_kernel_node_pack_io( node_params, param_num,
             inputs, input_num, outputs, output_num );
         for (i = 0; i < buffer_size; i++)
         {
-            node_params[SCALAR_MATRIX_OFFSET + i] = vsi_nn_kernel_scalar_create(
+            node_params[scalar_matrix_offset + i] = vsi_nn_kernel_scalar_create(
                 graph, F32, &buffer[i] );
         }
         /* Pass parameters to node. */
-        status = vsi_nn_kernel_node_pass_param( node, node_params, _CUSTOM_WARP_AFFINE_PARAM_NUM );
+        status = vsi_nn_kernel_node_pass_param( node, node_params, param_num );
         for (i = 0; i < buffer_size; i++)
         {
-            vsi_nn_kernel_scalar_release( &node_params[SCALAR_MATRIX_OFFSET + i] );
+            vsi_nn_kernel_scalar_release( &node_params[scalar_matrix_offset + i] );
         }

         // Set default border mode.
         border.constant_value.U32 = 0x00000000;
         status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) );
@@ -42,7 +42,7 @@ typedef struct _custom_warp_affine_local_data_t {
 /*
     Declare number of input and output.
 */
-#define _INPUT_NUM (1)
+#define _INPUT_NUM (2)
 #define _OUTPUT_NUM (1)

 static vsi_status op_compute
@@ -63,7 +63,7 @@ static vsi_status op_compute

     self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
         "custom_warp_affine",
-        inputs, 1,
+        inputs, 2,
         outputs, 1, param );

     vsi_nn_kernel_param_release( &param );
@@ -269,7 +269,7 @@ static vsi_nn_kernel_node_t _setup
     ret = vsi_nn_kernel_optimize_element_shape(
         inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank);

-    if ( ret )
+    if ( !ret )
     {
         return NULL;
     }
@@ -297,21 +297,13 @@ static vsi_nn_kernel_node_t _setup

     if (axis < 0)
     {
-        axis_new = 0;
-        shapes[0][0] = 1;
-        shapes[0][1] = 1;
-        for (i = 0; i < inputs[0]->attr.dim_num; i++)
-        {
-            shapes[0][0] *= inputs[0]->attr.size[i];
-        }
-        rs_dim = 2;
-    }
-    else
-    {
-        vsi_nn_kernel_optimize_softmax_shape(
-            inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
-            shapes[0], &rs_dim, &axis_new);
+        axis += (int32_t)inputs[0]->attr.dim_num;
     }

+    vsi_nn_kernel_optimize_softmax_shape(
+        inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
+        shapes[0], &rs_dim, &axis_new);
+
     if (rs_dim > 3)
     {
         return NULL;
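The rewritten branch normalizes a negative axis into the usual [0, rank) range instead of special-casing it, so every axis now goes through the same shape optimizer. The normalization itself is just:

    #include <stdint.h>

    static int32_t normalize_axis(int32_t axis, uint32_t dim_num)
    {
        /* mirrors the new branch: axis -1 on a rank-4 tensor becomes 3 */
        return axis < 0 ? axis + (int32_t)dim_num : axis;
    }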
@@ -327,19 +327,40 @@ static vsi_nn_kernel_node_t _setup
     vsi_nn_kernel_node_t node = NULL;
     vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}};
     vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
     int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
     int32_t batch_dims = vsi_nn_kernel_param_get_int32( params, "batch_dims" );
-    int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
-    int32_t block_num = vsi_nn_kernel_param_get_int32( params, "block_num" );
-    int32_t axis_num = vsi_nn_kernel_param_get_int32( params, "axis_num" );
-    int32_t indices_num = vsi_nn_kernel_param_get_int32( params, "indices_num" );
+    int32_t block_size = 1;
+    int32_t block_num = 1;
+    int32_t axis_num = 0;
+    int32_t indices_num = 1;
     int32_t is_batch = batch_dims > 0 ? 1 : 0;
     vsi_size_t rs_dim = batch_dims == 0 ? 2 : 3;
-    int32_t is_array = block_size >= GPU_TENSOR_MAX_WIDTH ? 1 : 0;
-    int32_t i = 0;
+    int32_t is_array = 0;
+    uint32_t i = 0;
+    vsi_size_t *input_size = inputs[0]->attr.size;
+    uint32_t r_rank = vsi_nn_GetTensorIsScalar(inputs[0]) ? 0 : inputs[0]->attr.dim_num;
+    uint32_t q_rank = vsi_nn_GetTensorIsScalar(inputs[1]) ? 0 : inputs[1]->attr.dim_num;

     VSI_UNREFERENCED(input_num);
     VSI_UNREFERENCED(output_num);

+    for (i = 0; i < (uint32_t)axis; ++i)
+    {
+        block_size *= (int32_t)input_size[i];
+    }
+
+    axis_num = (int32_t)input_size[axis];
+    for (i = axis + 1; i < r_rank - batch_dims; ++i)
+    {
+        block_num *= (int32_t)input_size[i];
+    }
+    for (i = 0; i < q_rank - batch_dims; ++i)
+    {
+        indices_num *= (int32_t)inputs[1]->attr.size[i];
+    }
+
+    is_array = block_size >= GPU_TENSOR_MAX_WIDTH ? 1 : 0;
+
     status = cal_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, batch_dims, 0, &is_array);
     status |= cal_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, batch_dims, 1, &is_array);
     status |= cal_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, batch_dims, 0, &is_array);
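Instead of trusting caller-supplied values, the gather setup now derives block_size, axis_num, block_num, and indices_num from the tensor shapes themselves. A worked example of that derivation (sizes in ovxlib order, size[0] innermost; values are illustrative):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t size[4] = {5, 4, 3, 2}, idx_size[1] = {6};
        uint32_t r_rank = 4, q_rank = 1, axis = 2, batch_dims = 0;
        uint32_t block_size = 1, block_num = 1, indices_num = 1, i;

        for (i = 0; i < axis; ++i) block_size *= size[i];                      /* 5*4 = 20 */
        uint32_t axis_num = size[axis];                                        /* 3 */
        for (i = axis + 1; i < r_rank - batch_dims; ++i) block_num *= size[i]; /* 2 */
        for (i = 0; i < q_rank - batch_dims; ++i) indices_num *= idx_size[i];  /* 6 */

        printf("block_size=%u axis_num=%u block_num=%u indices_num=%u\n",
               block_size, axis_num, block_num, indices_num);
        return 0;
    }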
@@ -60,8 +60,13 @@ __BEGIN_DECLS
     HASH_LOG_SOFTMAX_SH_KERNEL_NAME(AXIS, F32, F32), \
     VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) },

+#define TENSOR_LOG_SOFTMAX_BFLOAT(AXIS, SRC0_TYPE, OUT_TYPE) \
+    { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0), \
+    HASH_LOG_SOFTMAX_SH_KERNEL_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \
+    VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) },
+
 #define HASH_LOG_SOFTMAX_SH_KERNEL_2D_NAME(AXIS, SRC0_TYPE, DST_TYPE) \
-    CVIVANTE_NAMESPACE("log_softmax_axis"#AXIS"_"#SRC0_TYPE"to"#DST_TYPE"_2D")
+    CVIVANTE_NAMESPACE("cl.log_softmax_axis"#AXIS"_"#SRC0_TYPE"to"#DST_TYPE"_2D")

 #define TENSOR_LOG_SOFTMAX_KERNELS_2D(AXIS, SRC0_TYPE, OUT_TYPE) \
     { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1), \
@@ -73,6 +78,11 @@ __BEGIN_DECLS
     HASH_LOG_SOFTMAX_SH_KERNEL_2D_NAME(AXIS, F32, F32), \
     VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) },

+#define TENSOR_LOG_SOFTMAX_BFLOAT_2D(AXIS, SRC0_TYPE, OUT_TYPE) \
+    { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1), \
+    HASH_LOG_SOFTMAX_SH_KERNEL_2D_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \
+    VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) },
+
 static const struct {
     uint32_t key;
     char* function_name;
@@ -85,11 +95,16 @@ static const struct {
     TENSOR_LOG_SOFTMAX_FLOAT(0, F16, F16)
     TENSOR_LOG_SOFTMAX_FLOAT(1, F16, F16)
     TENSOR_LOG_SOFTMAX_FLOAT(2, F16, F16)
+    TENSOR_LOG_SOFTMAX_BFLOAT(0, BF16, BF16)
+    TENSOR_LOG_SOFTMAX_BFLOAT(1, BF16, BF16)
+    TENSOR_LOG_SOFTMAX_BFLOAT(2, BF16, BF16)

     TENSOR_LOG_SOFTMAX_FLOAT_2D(0, F32, F32)
     TENSOR_LOG_SOFTMAX_FLOAT_2D(1, F32, F32)
     TENSOR_LOG_SOFTMAX_FLOAT_2D(0, F16, F16)
     TENSOR_LOG_SOFTMAX_FLOAT_2D(1, F16, F16)
+    TENSOR_LOG_SOFTMAX_BFLOAT_2D(0, BF16, BF16)
+    TENSOR_LOG_SOFTMAX_BFLOAT_2D(1, BF16, BF16)

     TENSOR_LOG_SOFTMAX_KERNELS(0, U8, U8)
     TENSOR_LOG_SOFTMAX_KERNELS(1, U8, U8)
@@ -35,7 +35,6 @@
 #include "vsi_nn_tensor_util.h"
 #include "utils/vsi_nn_util.h"
 #include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"

 __BEGIN_DECLS
@@ -1572,8 +1571,8 @@ static vsi_nn_kernel_node_t _setup

     if (outputs[LSTMUNIT_ACT_OUTPUT] && VSI_NN_TYPE_UINT8 == outputs[LSTMUNIT_ACT_OUTPUT]->attr.dtype.vx_type)
     {
-        scale_val[8] = 1.0f / vsi_nn_get_tensor_scale(inputs[LSTMUNIT_ACT_OUTPUT]);
-        tail_val[8] = (float)vsi_nn_get_tensor_zero_point(inputs[LSTMUNIT_ACT_OUTPUT]);
+        scale_val[8] = 1.0f / vsi_nn_get_tensor_scale(outputs[LSTMUNIT_ACT_OUTPUT]);
+        tail_val[8] = (float)vsi_nn_get_tensor_zero_point(outputs[LSTMUNIT_ACT_OUTPUT]);
     }

     if( VSI_SUCCESS == status)
@@ -35,6 +35,7 @@
 #include "utils/vsi_nn_util.h"
 #include "kernel/vsi_nn_kernel.h"
 #include "kernel/vsi_nn_kernel_eltwise.h"
+#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"

 __BEGIN_DECLS
@@ -44,6 +45,7 @@ __BEGIN_DECLS
 #define KERNEL_SOURCE_1 "matrixmul"
 #define KERNEL_SOURCE_2 "matrixmul_transA"
 #define KERNEL_SOURCE_3 "matrixmul_cross"
+#define KERNEL_SOURCE_4 "matrixmul_4x"

 typedef enum
 {
@@ -51,8 +53,9 @@ __BEGIN_DECLS
     _3D
 } vsi_nn_kernel_image_dim_type_e;

-#define HASH_MATRIXMUL_KEY(_type0, _type1, _type2, _image_dim, _trans_a, _cross) \
-    ((_type0 << 24) | (_type1 << 16) | (_type2 << 8) | (_image_dim << 4) | (_trans_a << 2) | (_cross))
+#define HASH_MATRIXMUL_KEY(_type0, _type1, _type2, _image_dim, flag_4x, _trans_a, _cross) \
+    ((_type0 << 24) | (_type1 << 16) | (_type2 << 8) | (_image_dim << 6) | \
+    (flag_4x << 4) | (_trans_a << 2) | (_cross))

 #define HASH_MATRIXMUL_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \
     CVIVANTE_NAMESPACE("cl.gemm_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM)
@@ -66,23 +69,39 @@ __BEGIN_DECLS
 #define HASH_MATRIXMUL_SH_KERNEL_MERGE_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \
     CVIVANTE_NAMESPACE("cl.gemm_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE"_merge")

+#define HASH_MATRIXMUL_4X_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \
+    CVIVANTE_NAMESPACE("cl.gemm_4x_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM)
+
+#define HASH_MATRIXMUL_4X_TRANSA_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \
+    CVIVANTE_NAMESPACE("cl.gemm_4x_transa_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM)
+
 #define TENSOR_MATRIXMUL_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \
-    { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 0), \
+    { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 0, 0), \
     HASH_MATRIXMUL_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \
     SOURCE },

+#define TENSOR_MATRIXMUL_4X_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \
+    {HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 1, 0, 0), \
+    HASH_MATRIXMUL_4X_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \
+    SOURCE },
+
+#define TENSOR_MATRIXMUL_4X_TRANSA_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \
+    {HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 1, 1, 0), \
+    HASH_MATRIXMUL_4X_TRANSA_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \
+    SOURCE },
+
 #define TENSOR_MATRIXMUL_TRANSA_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \
-    { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 1, 0), \
+    {HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 1, 0), \
     HASH_MATRIXMUL_TRANSA_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \
     SOURCE },

 #define TENSOR_MATRIXMUL_TRANSB_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \
-    { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 2, 0), \
+    {HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 2, 0), \
     HASH_MATRIXMUL_TRANSB_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \
     SOURCE },

 #define TENSOR_MATRIXMUL_MERGE_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \
-    { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 2), \
+    {HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 0, 2), \
     HASH_MATRIXMUL_SH_KERNEL_MERGE_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \
     SOURCE },
@@ -92,35 +111,37 @@ static const struct {
     const char* source_name;
 } matrixmul_map[] =
 {
-    TENSOR_MATRIXMUL_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_1)
-    TENSOR_MATRIXMUL_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_1)
-    TENSOR_MATRIXMUL_TRANSA_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_2)
-    TENSOR_MATRIXMUL_TRANSA_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_2)
-    TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_1)
-    TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_1)
-    TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, I8, F32, _2D, KERNEL_SOURCE_1)
-    TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, I8, F32, _3D, KERNEL_SOURCE_1)
-    TENSOR_MATRIXMUL_KERNELS(I8, I8, I8, _2D, KERNEL_SOURCE_1)
-    TENSOR_MATRIXMUL_KERNELS(I8, I8, I8, _3D, KERNEL_SOURCE_1)
-    TENSOR_MATRIXMUL_TRANSA_KERNELS(I8, I8, I8, _2D, KERNEL_SOURCE_2)
-    TENSOR_MATRIXMUL_TRANSA_KERNELS(I8, I8, I8, _3D, KERNEL_SOURCE_2)
-    TENSOR_MATRIXMUL_TRANSB_KERNELS(I8, I8, I8, _2D, KERNEL_SOURCE_1)
-    TENSOR_MATRIXMUL_TRANSB_KERNELS(I8, I8, I8, _3D, KERNEL_SOURCE_1)
-    TENSOR_MATRIXMUL_KERNELS(U8, U8, U8, _2D, KERNEL_SOURCE_1)
-    TENSOR_MATRIXMUL_KERNELS(U8, U8, U8, _3D, KERNEL_SOURCE_1)
-    TENSOR_MATRIXMUL_TRANSA_KERNELS(U8, U8, U8, _2D, KERNEL_SOURCE_2)
-    TENSOR_MATRIXMUL_TRANSA_KERNELS(U8, U8, U8, _3D, KERNEL_SOURCE_2)
-    TENSOR_MATRIXMUL_TRANSB_KERNELS(U8, U8, U8, _2D, KERNEL_SOURCE_1)
-    TENSOR_MATRIXMUL_TRANSB_KERNELS(U8, U8, U8, _3D, KERNEL_SOURCE_1)
-    TENSOR_MATRIXMUL_KERNELS(U8, U8, F32, _2D, KERNEL_SOURCE_1)
-    TENSOR_MATRIXMUL_KERNELS(U8, U8, F32, _3D, KERNEL_SOURCE_1)
-    TENSOR_MATRIXMUL_TRANSA_KERNELS(U8, U8, F32, _2D, KERNEL_SOURCE_2)
-    TENSOR_MATRIXMUL_TRANSA_KERNELS(U8, U8, F32, _3D, KERNEL_SOURCE_2)
-    TENSOR_MATRIXMUL_TRANSB_KERNELS(U8, U8, F32, _2D, KERNEL_SOURCE_1)
-    TENSOR_MATRIXMUL_TRANSB_KERNELS(U8, U8, F32, _3D, KERNEL_SOURCE_1)
-    TENSOR_MATRIXMUL_MERGE_KERNELS(U8, U8, U8, _3D, KERNEL_SOURCE_3)
-    TENSOR_MATRIXMUL_MERGE_KERNELS(I8, I8, I8, _3D, KERNEL_SOURCE_3)
-    TENSOR_MATRIXMUL_MERGE_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_3)
+    TENSOR_MATRIXMUL_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_1)
+    TENSOR_MATRIXMUL_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_1)
+    TENSOR_MATRIXMUL_TRANSA_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_2)
+    TENSOR_MATRIXMUL_TRANSA_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_2)
+    TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_1)
+    TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_1)
+    TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, I8, F32, _2D, KERNEL_SOURCE_1)
+    TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, I8, F32, _3D, KERNEL_SOURCE_1)
+    TENSOR_MATRIXMUL_KERNELS(I8, I8, I8, _2D, KERNEL_SOURCE_1)
+    TENSOR_MATRIXMUL_KERNELS(I8, I8, I8, _3D, KERNEL_SOURCE_1)
+    TENSOR_MATRIXMUL_TRANSA_KERNELS(I8, I8, I8, _2D, KERNEL_SOURCE_2)
+    TENSOR_MATRIXMUL_TRANSA_KERNELS(I8, I8, I8, _3D, KERNEL_SOURCE_2)
+    TENSOR_MATRIXMUL_TRANSB_KERNELS(I8, I8, I8, _2D, KERNEL_SOURCE_1)
+    TENSOR_MATRIXMUL_TRANSB_KERNELS(I8, I8, I8, _3D, KERNEL_SOURCE_1)
+    TENSOR_MATRIXMUL_KERNELS(U8, U8, U8, _2D, KERNEL_SOURCE_1)
+    TENSOR_MATRIXMUL_KERNELS(U8, U8, U8, _3D, KERNEL_SOURCE_1)
+    TENSOR_MATRIXMUL_TRANSA_KERNELS(U8, U8, U8, _2D, KERNEL_SOURCE_2)
+    TENSOR_MATRIXMUL_TRANSA_KERNELS(U8, U8, U8, _3D, KERNEL_SOURCE_2)
+    TENSOR_MATRIXMUL_TRANSB_KERNELS(U8, U8, U8, _2D, KERNEL_SOURCE_1)
+    TENSOR_MATRIXMUL_TRANSB_KERNELS(U8, U8, U8, _3D, KERNEL_SOURCE_1)
+    TENSOR_MATRIXMUL_KERNELS(U8, U8, F32, _2D, KERNEL_SOURCE_1)
+    TENSOR_MATRIXMUL_KERNELS(U8, U8, F32, _3D, KERNEL_SOURCE_1)
+    TENSOR_MATRIXMUL_TRANSA_KERNELS(U8, U8, F32, _2D, KERNEL_SOURCE_2)
+    TENSOR_MATRIXMUL_TRANSA_KERNELS(U8, U8, F32, _3D, KERNEL_SOURCE_2)
+    TENSOR_MATRIXMUL_TRANSB_KERNELS(U8, U8, F32, _2D, KERNEL_SOURCE_1)
+    TENSOR_MATRIXMUL_TRANSB_KERNELS(U8, U8, F32, _3D, KERNEL_SOURCE_1)
+    TENSOR_MATRIXMUL_MERGE_KERNELS(U8, U8, U8, _3D, KERNEL_SOURCE_3)
+    TENSOR_MATRIXMUL_MERGE_KERNELS(I8, I8, I8, _3D, KERNEL_SOURCE_3)
+    TENSOR_MATRIXMUL_MERGE_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_3)
+    TENSOR_MATRIXMUL_4X_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_4)
+    TENSOR_MATRIXMUL_4X_TRANSA_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_4)
 };

 /*
@@ -252,12 +273,53 @@ final:
     return status;
 } /* _matrixmul_initializer() */

+DEF_KERNEL_INITIALIZER(_matrixmul_4x_initializer)
+    (vsi_nn_kernel_node_t node,
+    const vsi_nn_kernel_node_param_t* param,
+    size_t param_size) {
+    vsi_status status = VSI_FAILURE;
+    gpu_param_t gpu_param = {3, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}};
+
+    vsi_nn_kernel_tensor_attr_t* attr = NULL;
+    vsi_size_t width = 0;
+    vsi_size_t height = 0;
+
+    VSI_UNREFERENCED(param_size);
+
+    attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[2]);
+    CHECK_PTR_FAIL_GOTO(attr, "Create tensor attr buffer fail.", final);
+
+    width = attr->shape->data[0];
+    height = attr->shape->data[1];
+
+    gpu_param.dim = 2;
+    gpu_param.global_scale[0] = 4;
+    gpu_param.global_scale[1] = 1;
+    gpu_param.global_scale[2] = 1;
+
+    gpu_param.global_size[0] = (width + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0];
+    gpu_param.global_size[1] = height;
+    gpu_param.global_size[2] = 1;
+
+    status = vsi_nn_kernel_gpu_config(node, &gpu_param);
+    CHECK_STATUS_FAIL_GOTO(status, final);
+
+final:
+    if (attr) {
+        vsi_nn_kernel_tensor_attr_release(&attr);
+        attr = NULL;
+    }
+    return status;
+} /* _matrixmul_4x_initializer() */
+
 static vsi_status _query_kernel
     (
     vsi_nn_kernel_t * kernel,
     vsi_nn_tensor_t * const * const inputs,
     vsi_nn_tensor_t * const * const outputs,
     vsi_size_t depth,
+    int32_t flag_4x,
     int32_t transa,
     int32_t cross
     )
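The 4x initializer sets global_scale[0] to 4, so each work-item produces four output columns and the x work-size is a rounded-up quarter of the output width. The arithmetic, spelled out (values illustrative):

    #include <stdio.h>

    int main(void)
    {
        unsigned width = 30, height = 8, scale_x = 4;
        unsigned gx = (width + scale_x - 1) / scale_x;   /* ceil(30/4) = 8 */
        printf("global_size = {%u, %u}\n", gx, height);
        return 0;
    }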
@@ -317,7 +379,7 @@ static vsi_status _query_kernel
         output_dtype = U8;
     }

-    key = HASH_MATRIXMUL_KEY( input0_dtype, input1_dtype, output_dtype, dim_type, transa, cross );
+    key = HASH_MATRIXMUL_KEY( input0_dtype, input1_dtype, output_dtype, dim_type, flag_4x, transa, cross );

     for( i = 0; i < _cnt_of_array(matrixmul_map); i ++ )
     {
@@ -340,7 +402,13 @@ static vsi_status _query_kernel
         kernel->info.parameters = _matrixmul_merge_kernel_param_def;
         kernel->info.numParams = _cnt_of_array( _matrixmul_merge_kernel_param_def );
     }
-    kernel->info.initialize = _matrixmul_initializer;
+
+    if (flag_4x) {
+        kernel->info.initialize = _matrixmul_4x_initializer;
+    } else {
+        kernel->info.initialize = _matrixmul_initializer;
+    }
+
     // Register code source
     vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
         "eltwise_ops_helper",
@@ -352,6 +420,8 @@ static vsi_status _query_kernel
     }
     return status;
 } /* _query_kernel() */
+
+
 static vsi_nn_kernel_node_t _setup
     (
     vsi_nn_graph_t * graph,
|
@ -368,8 +438,8 @@ static vsi_nn_kernel_node_t _setup
|
|||
vsi_nn_kernel_node_t node = NULL;
|
||||
int32_t transposeA = vsi_nn_kernel_param_get_int32( params, "transposeA" );
|
||||
int32_t transposeB = vsi_nn_kernel_param_get_int32( params, "transposeB" );
|
||||
int32_t cross_flg = vsi_nn_kernel_param_get_int32( params, "cross_flg" );
|
||||
int32_t transFlg = 0;
|
||||
int32_t flag_4x = 0;
|
||||
vsi_size_t M = inputs[0]->attr.size[1];
|
||||
vsi_size_t K = inputs[0]->attr.size[0];
|
||||
vsi_size_t N = inputs[1]->attr.size[0];
|
||||
|
|
@@ -385,6 +455,22 @@ static vsi_nn_kernel_node_t _setup
     float scale_out = vsi_nn_get_tensor_scale(outputs[0]);
     float zp_out = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
     int32_t outer = 0;
+    vsi_size_t final_shape[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1};
+    uint32_t final_rank = 0;
+    vsi_nn_tensor_t* rs_in_tensors = NULL;
+    vsi_nn_tensor_t* rs_out_tensors = NULL;
+    vsi_nn_tensor_t* final_in_tensors[2] = {NULL};
+    vsi_nn_tensor_t* final_out_tensors[1] = {NULL};
+    vsi_nn_kernel_dtype_e input0_dtype = U8;
+    vsi_nn_kernel_dtype_e input1_dtype = U8;
+    vsi_nn_kernel_dtype_e output_dtype = U8;
+    vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}};
+    uint32_t new_rank[3] = {0};
+    uint32_t cross_flg = 0;
+    uint32_t size_axis_in_out[3] = {0};
+    uint32_t stride_axis_in_out[9] = {0};
+    vsi_nn_tensor_t* tmp_inputs[2] = {NULL};
+    vsi_nn_tensor_t* tmp_outputs[1] = {NULL};

     VSI_UNREFERENCED(input_num);
     VSI_UNREFERENCED(output_num);
@ -397,6 +483,33 @@ static vsi_nn_kernel_node_t _setup
return NULL;
}

status = vsi_nn_kernel_optimize_matrixmul_broadcast_shape(
inputs[0]->attr.size,
inputs[1]->attr.size,
outputs[0]->attr.size,
inputs[0]->attr.dim_num,
inputs[1]->attr.dim_num,
outputs[0]->attr.dim_num,
shapes[0], shapes[1], shapes[2], new_rank,
&cross_flg, size_axis_in_out, stride_axis_in_out);
if (status)
{
tmp_inputs[0] = vsi_nn_reshape_tensor(graph, inputs[0], shapes[0], new_rank[0]);
tmp_inputs[1] = vsi_nn_reshape_tensor(graph, inputs[1], shapes[1], new_rank[1]);
tmp_outputs[0] = vsi_nn_reshape_tensor(graph, outputs[0], shapes[2], new_rank[2]);

M = tmp_inputs[0]->attr.size[1];
K = tmp_inputs[0]->attr.size[0];
N = tmp_inputs[1]->attr.size[0];
depth = tmp_outputs[0]->attr.dim_num > 2 ? tmp_outputs[0]->attr.size[2] : 1;
}
else
{
VSILOGE("illegal inputs shape");
status = VSI_FAILURE;
goto final;
}

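M, K and N follow ovxlib's width-first layout, where size[0] is the innermost dimension: A is M rows by K columns, B is K rows by N columns. A minimal reference loop, only to pin down that convention (names here are illustrative, not library API):

#include <stddef.h>

/* Reference semantics sketch: C[M][N] = A[M][K] * B[K][N], row-major, with
 * A's size = {K, M}, B's size = {N, K}, C's size = {N, M} in width-first terms. */
static void matmul_ref(const float *a, const float *b, float *c,
                       size_t M, size_t K, size_t N)
{
    for (size_t m = 0; m < M; m++)
        for (size_t n = 0; n < N; n++)
        {
            float acc = 0.0f;
            for (size_t k = 0; k < K; k++)
                acc += a[m * K + k] * b[k * N + n];
            c[m * N + n] = acc;
        }
}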
if (transposeB)
{
N = inputs[1]->attr.size[1];
@ -410,8 +523,8 @@ static vsi_nn_kernel_node_t _setup
transFlg = 1;
}

a_depth = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1;
b_depth = inputs[1]->attr.dim_num > 2 ? inputs[1]->attr.size[2] : 1;
a_depth = tmp_inputs[0]->attr.dim_num > 2 ? tmp_inputs[0]->attr.size[2] : 1;
b_depth = tmp_inputs[1]->attr.dim_num > 2 ? tmp_inputs[1]->attr.size[2] : 1;

if (b_depth == 1)
{
@ -422,14 +535,14 @@ static vsi_nn_kernel_node_t _setup
ac2zero = 1;
}

if (inputs[0]->attr.dim_num == 4 && inputs[1]->attr.dim_num == 3
if (tmp_inputs[0]->attr.dim_num == 4 && tmp_inputs[1]->attr.dim_num == 3
&& a_depth > 1 && b_depth > 1 && cross_flg == 2)
{
ac2zero = 1;
bc2zero = 0;
outer = (int32_t)a_depth;
}
else if (inputs[1]->attr.dim_num == 4 && inputs[0]->attr.dim_num == 3
else if (tmp_inputs[1]->attr.dim_num == 4 && tmp_inputs[0]->attr.dim_num == 3
&& a_depth > 1 && b_depth > 1 && cross_flg == 2)
{
ac2zero = 0;
@ -437,7 +550,46 @@ static vsi_nn_kernel_node_t _setup
outer = (int32_t)b_depth;
}

status = _query_kernel( kernel, inputs, outputs, depth, transFlg, cross_flg );
final_in_tensors[0] = tmp_inputs[0];
final_in_tensors[1] = tmp_inputs[1];
final_out_tensors[0] = tmp_outputs[0];

input0_dtype = vsi_nn_kernel_map_dtype(tmp_inputs[0]->attr.dtype.vx_type);
input1_dtype = vsi_nn_kernel_map_dtype(tmp_inputs[1]->attr.dtype.vx_type);
output_dtype = vsi_nn_kernel_map_dtype(tmp_outputs[0]->attr.dtype.vx_type);

if (((transFlg == 0) || (transFlg == 1)) && (cross_flg == 0) &&
(F32 == input0_dtype) && (F32 == input1_dtype) && (F32 == output_dtype))
{
vsi_size_t in1_w = tmp_inputs[1]->attr.size[0];
vsi_size_t in1_h = tmp_inputs[1]->attr.size[1];
vsi_size_t in1_c = tmp_inputs[1]->attr.dim_num > 2 ? tmp_inputs[1]->attr.size[2] : 1;
vsi_size_t in1_n = tmp_inputs[1]->attr.dim_num > 3 ? tmp_inputs[1]->attr.size[3] : 1;
vsi_size_t out_w = tmp_outputs[0]->attr.size[0];
vsi_size_t out_h = tmp_outputs[0]->attr.size[1];
vsi_size_t out_c = tmp_outputs[0]->attr.dim_num > 2 ? tmp_outputs[0]->attr.size[2] : 1;
vsi_size_t out_n = tmp_outputs[0]->attr.dim_num > 3 ? tmp_outputs[0]->attr.size[3] : 1;
if ((in1_w == 1) && (in1_h % 4 == 0) && (in1_c == 1) && (in1_n == 1) &&
(out_w == 1) && (out_h % 4 == 0) && (out_c == 1) && (out_n == 1))
{
final_shape[0] = in1_h;
final_shape[1] = in1_w;
final_rank = 2;
rs_in_tensors = vsi_nn_reshape_tensor(graph, tmp_inputs[1], final_shape, final_rank);
final_in_tensors[1] = rs_in_tensors;

final_shape[0] = out_h;
final_shape[1] = out_w;
final_rank = 2;
rs_out_tensors = vsi_nn_reshape_tensor(graph, tmp_outputs[0], final_shape, final_rank);
final_out_tensors[0] = rs_out_tensors;

flag_4x = 1;
}
}

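The 4x fast path only fires when input B and the output degenerate to column vectors whose height is a multiple of 4; the reshape to a rank-2 {h, 1} view is what lets the 4x initializer step four rows per work item. A standalone sketch of that gating test, under the same width-first assumptions (names illustrative):

/* Sketch of the gating condition: a w == 1 tensor with h % 4 == 0 and no
 * batch/channel extent can be flattened to a 2-D {h, 1} view for the 4x kernel. */
static int can_use_4x_path(const vsi_size_t *size, uint32_t rank)
{
    vsi_size_t w = size[0];
    vsi_size_t h = rank > 1 ? size[1] : 1;
    vsi_size_t c = rank > 2 ? size[2] : 1;
    vsi_size_t n = rank > 3 ? size[3] : 1;
    return (w == 1) && (h % 4 == 0) && (c == 1) && (n == 1);
}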
status = _query_kernel(kernel, tmp_inputs, tmp_outputs, depth, flag_4x, transFlg, cross_flg);
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
@ -447,7 +599,7 @@ static vsi_nn_kernel_node_t _setup
size_t param_num = cross_flg == 2 ? _MATRIXMUL_MERGE_PARAM_NUM : _MATRIXMUL_PARAM_NUM;
/* Pass parameters to node. */
vsi_nn_kernel_node_pack_io( node_params, param_num,
inputs, 2, outputs, 1 );
final_in_tensors, 2, final_out_tensors, 1 );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &M );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &K );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &N );
@ -483,6 +635,14 @@ static vsi_nn_kernel_node_t _setup
}
}
}

final:
vsi_safe_release_tensor(tmp_inputs[0]);
vsi_safe_release_tensor(tmp_inputs[1]);
vsi_safe_release_tensor(tmp_outputs[0]);
vsi_safe_release_tensor(rs_in_tensors);
vsi_safe_release_tensor(rs_out_tensors);

return node;
} /* _setup() */

@ -35,7 +35,8 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
#include "utils/vsi_nn_dtype_util.h"

__BEGIN_DECLS

@ -114,6 +115,7 @@ static const _kernel_map_type moments_map[] =
TENSOR_MOMENTS_TWO_AXIS_KERNELS(F32, F32, 0, 1, KERNEL_SOURCE_4)
TENSOR_MOMENTS_TWO_AXIS_KERNELS(I32, F32, 0, 1, KERNEL_SOURCE_4)
TENSOR_MOMENTS_TWO_AXIS_KERNELS(BF16,F32, 0, 1, KERNEL_SOURCE_4)
TENSOR_MOMENTS_TWO_AXIS_KERNELS(U8, F32, 1, 2, KERNEL_SOURCE_4)
TENSOR_MOMENTS_THREE_AXIS_KERNELS(U8, F32, 0, 1, 2, KERNEL_SOURCE_5)
TENSOR_MOMENTS_THREE_AXIS_KERNELS(F32, F32, 0, 1, 2, KERNEL_SOURCE_5)
TENSOR_MOMENTS_THREE_AXIS_KERNELS(I32, F32, 0, 1, 2, KERNEL_SOURCE_5)
@ -140,63 +142,6 @@ static vx_param_description_t _moments_kernel_param_def[] =
};
#define _MOMENTS_PARAM_NUM _cnt_of_array( _moments_kernel_param_def )

static int32_t set_constant_border
(
vsi_nn_kernel_node_t node,
int32_t value
)
{
vsi_status status = VSI_FAILURE;
vx_border_t border;
border.mode = VX_BORDER_CONSTANT;
border.constant_value.S32 = value;
border.constant_value.U32 = (vx_uint32)value;
border.constant_value.S16 = (vx_int16)value;
border.constant_value.U8 = (vx_uint8)value;
status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) );
return status;
}

static int32_t get_moments_output_reshape_size
(
vsi_nn_tensor_t ** outputs,
vsi_size_t sizes[VSI_NN_MAX_DIM_NUM],
int32_t* axis,
int32_t axis_num
)
{
uint32_t out_dims_num = outputs[0]->attr.dim_num;
vsi_size_t *output_size = outputs[0]->attr.size;
uint32_t i = 0;
int32_t out_rs_flg = 0;

for(i = 0; i < VSI_NN_MAX_DIM_NUM; ++i)
{
sizes[i] = 1;
}
sizes[3] = out_dims_num > 3 ? output_size[3] : 1;

if (axis_num == 1 && axis[0] == 0)
{
sizes[0] = output_size[1];
sizes[1] = out_dims_num > 2 ? output_size[2] : 1;
out_rs_flg = 1;
}
else if (axis_num == 1 && axis[0] == 1)
{
sizes[0] = output_size[0];
sizes[1] = out_dims_num > 2 ? output_size[2] : 1;
out_rs_flg = 1;
}
else if (axis_num == 2 && axis[0] == 0 && axis[1] == 1)
{
sizes[0] = out_dims_num > 2 ? output_size[2] : 1;
out_rs_flg = 1;
}

return out_rs_flg;
} /* _get_moments_tensor_reshape_size */

/*
* Kernel initializer
*/
@ -247,26 +192,39 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
gpu_param.global_size[0] = gpu_align_p2((height + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = chn;
gpu_param.global_size[2] = 1;
}
else if (axis_num == 1 && axis == 1)
{
gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = chn;
gpu_param.global_size[2] = 1;
}
else if (axis_num == 1 && axis == 2)
{
gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = height;
gpu_param.global_size[2] = 1;
}
else if (axis_num == 2)
else if (axis_num == 2 && axis == 0)
{
gpu_param.local_size[0] = 16;
gpu_param.local_size[1] = 1;
gpu_param.local_size[2] = 1;
gpu_param.global_size[0] = 16;
gpu_param.global_size[1] = chn;
gpu_param.global_size[2] = 1;
}
else if (axis_num == 2 && axis == 1)
{
gpu_param.local_size[0] = 8;
gpu_param.local_size[1] = 8;
gpu_param.local_size[2] = 1;
gpu_param.global_size[0] = 8;
gpu_param.global_size[1] = 8;
gpu_param.global_size[2] = width;
}
else if (axis_num == 3)
{
@ -275,8 +233,8 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
gpu_param.local_size[2] = 1;
gpu_param.global_size[0] = 16;
gpu_param.global_size[1] = 1;
gpu_param.global_size[2] = 1;
}
gpu_param.global_size[2] = 1;

status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, final);
@ -366,117 +324,78 @@ static vsi_nn_kernel_node_t _setup
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_MOMENTS_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
vsi_size_t out_shape[VSI_NN_MAX_DIM_NUM] = {0};
vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0};
int32_t out_rs_flg = 0;
int32_t axis_num = 0;
size_t axis_num_temp = 0;
int32_t* axis = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "axis", &axis_num_temp);
int32_t keep_dim = vsi_nn_kernel_param_get_int32( params, "keep_dim" );
size_t axis_num = 0;
int32_t* axis = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "axis", &axis_num);
int32_t first_axis = axis[0];
int32_t i = 0;
uint32_t i = 0;
vsi_nn_kernel_scalar_t scalar_list[INTERNAL_MOMENTS_SCALAR_NUM] = {NULL};
vsi_nn_kernel_tensor_t reshape_tensors[3] = { NULL };

vsi_size_t width = inputs[0]->attr.size[0];
vsi_size_t height = inputs[0]->attr.size[1];
vsi_size_t chn = inputs[0]->attr.size[2];
uint32_t axis_size = 0;
uint32_t rank_in = 0;
uint32_t rank_out = 0;
vsi_bool ret = FALSE;
vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 1, 1, 1, 1 } };
vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
int32_t new_axis[VSI_NN_MAX_DIM_NUM] = {0};
int32_t input_zp = vsi_nn_get_tensor_zero_point(inputs[0]);
float input_scale = vsi_nn_get_tensor_scale(inputs[0]);
float dim_ratio = (float)1.0 / (float)(width * height);
float dim_ratio = 1;

VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);

axis_num = (int32_t)axis_num_temp;
ret = vsi_nn_kernel_optimize_reduce_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num,
axis, (vsi_size_t)axis_num,
outputs[0]->attr.size, outputs[0]->attr.dim_num,
shapes[0], &rank_in, shapes[1], &rank_out,
new_axis, &axis_size);

if (axis_num == 1 && axis[0] == 0)
{
dim_ratio = (float)1.0 / (float)(width);
}
else if (axis_num == 1 && axis[0] == 1)
{
dim_ratio = (float)1.0 / (float)(height);
}
else if (axis_num == 1 && axis[0] == 2)
{
dim_ratio = (float)1.0 / (float)(chn);
}
else if (axis_num == 2 && axis[0] == 0 && axis[1] == 1)
{
dim_ratio = (float)1.0 / (float)(width * height);
}
else if (axis_num == 3)
{
dim_ratio = (float)1.0 / (float)(width * height * chn);
}

if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
if ( ret == FALSE || axis_size > 3 || (axis_size == 3 && new_axis[0] != 0))
{
return NULL;
}

if (keep_dim)
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shapes[0], rank_in );
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[1], rank_out );
reshape_tensors[2] = vsi_nn_reshape_tensor( graph,
outputs[1], shapes[1], rank_out );

first_axis = new_axis[0];

for ( i = 0; i < axis_size; i++ )
{
out_rs_flg = get_moments_output_reshape_size(&outputs[0], out_shape, axis, axis_num);
dim_ratio = dim_ratio / (float)(shapes[0][new_axis[i]]);
}

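After the reduce-shape optimization, dim_ratio is just the reciprocal of the number of reduced elements, accumulated one axis at a time. A minimal sketch of the arithmetic (names illustrative):

/* Sketch: dim_ratio accumulates 1 / product(reduced extents), the scale used
 * to turn the kernel's running sums into E[x] and E[x^2]. */
static float moments_dim_ratio(const vsi_size_t *in_shape,
                               const int32_t *new_axis, uint32_t axis_size)
{
    float dim_ratio = 1.0f;
    uint32_t i;
    for (i = 0; i < axis_size; i++)
    {
        dim_ratio /= (float)in_shape[new_axis[i]];
    }
    return dim_ratio;   /* e.g. axes {0,1} over {640, 480, 3} -> 1/(640*480) */
}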
if (inputs[0]->attr.dim_num < 2)
if ( !vsi_nn_kernel_gpu_check_shape( shapes[0], rank_in) )
{
shape[0] = inputs[0]->attr.size[0];
shape[1] = 1;
reshape_tensors[0] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 2 );
}
if (outputs[0]->attr.dim_num < 2)
{
shape[0] = outputs[0]->attr.size[0];
shape[1] = 1;
reshape_tensors[1] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 2 );
reshape_tensors[2] = vsi_nn_kernel_tensor_reshape( outputs[1]->t, shape, 2 );
return NULL;
}

scalar_list[AXIS] = vsi_nn_kernel_scalar_create( graph, I32, &first_axis );
scalar_list[AXIS_NUM] = vsi_nn_kernel_scalar_create( graph, I32, &axis_num );
scalar_list[AXIS_NUM] = vsi_nn_kernel_scalar_create( graph, I32, &axis_size );
scalar_list[ZP] = vsi_nn_kernel_scalar_create( graph, I32, &input_zp );
scalar_list[SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale );
scalar_list[WIDTH] = vsi_nn_kernel_scalar_create( graph, I32, &width );
scalar_list[HEIGHT] = vsi_nn_kernel_scalar_create( graph, I32, &height );
scalar_list[CHN] = vsi_nn_kernel_scalar_create( graph, I32, &chn );
scalar_list[WIDTH] = vsi_nn_kernel_scalar_create( graph, I32, &shapes[0][0] );
scalar_list[HEIGHT] = vsi_nn_kernel_scalar_create( graph, I32, &shapes[0][1] );
scalar_list[CHN] = vsi_nn_kernel_scalar_create( graph, I32, &shapes[0][2] );
scalar_list[DIMRATIO] = vsi_nn_kernel_scalar_create( graph, F32, &dim_ratio );

status = _query_kernel( inputs, outputs, kernel, params, axis, axis_num, 0 );
status = _query_kernel( inputs, outputs, kernel, params, new_axis, axis_size, 0 );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
uint32_t index = 0;
int32_t constant_value = vsi_nn_get_tensor_zero_point(inputs[0]);
vx_border_t border;
/* Pass parameters to node. */
if (reshape_tensors[0])
{
node_params[index++] = reshape_tensors[0];
}
else
{
node_params[index++] = (vsi_nn_kernel_node_param_t)(inputs[0]->t);
}
if (out_rs_flg)
{
node_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, out_shape, 4 );
node_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[1]->t, out_shape, 4 );
}
else if (reshape_tensors[1])
{
node_params[index++] = reshape_tensors[1];
node_params[index++] = reshape_tensors[2];
}
else
{
node_params[index++] = (vsi_nn_kernel_node_param_t)(outputs[0]->t);
node_params[index++] = (vsi_nn_kernel_node_param_t)(outputs[1]->t);
}
node_params[index++] = reshape_tensors[0]->t;
node_params[index++] = reshape_tensors[1]->t;
node_params[index++] = reshape_tensors[2]->t;

node_params[index++] = scalar_list[AXIS];
node_params[index++] = scalar_list[AXIS_NUM];
node_params[index++] = scalar_list[ZP];
@ -487,29 +406,19 @@ static vsi_nn_kernel_node_t _setup
node_params[index++] = scalar_list[DIMRATIO];
status = vsi_nn_kernel_node_pass_param( node, node_params, _MOMENTS_PARAM_NUM );
CHECK_STATUS(status);
if (out_rs_flg)
{
vsi_nn_kernel_tensor_release( &node_params[1] );
vsi_nn_kernel_tensor_release( &node_params[2] );
}

status = set_constant_border(node, constant_value);
// Set default border mode.
border.mode = VX_BORDER_CONSTANT;
vsi_nn_Float32ToDtype(0, (uint8_t*)&border.constant_value.U32, &inputs[0]->attr.dtype);
status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) );
CHECK_STATUS(status);
}
}

if (reshape_tensors[0])
{
vsi_nn_kernel_tensor_release( &reshape_tensors[0] );
}
if (reshape_tensors[1])
{
vsi_nn_kernel_tensor_release( &reshape_tensors[1] );
}
if (reshape_tensors[2])
{
vsi_nn_kernel_tensor_release( &reshape_tensors[2] );
}
vsi_safe_release_tensor(reshape_tensors[0]);
vsi_safe_release_tensor(reshape_tensors[1]);
vsi_safe_release_tensor(reshape_tensors[2]);

/* Pass parameters to node. */
for( i = 0; i < INTERNAL_MOMENTS_SCALAR_NUM; i ++ )
{

@ -0,0 +1,318 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"

__BEGIN_DECLS

/*
* Define kernel meta.
*/

#define _MAXPOOL_KERNEL_SOURCE_NAME "maxpool"

typedef enum
{
_error = -1,
_MAX = 0,
_AVG
} vsi_nn_pool_type_e;

// Add kernel hashtable here
#define POOL_HASH_KEY( IN_DTYPE0, OUT_DTYPE, POOL_DTYPE ) \
(( IN_DTYPE0 << 16 ) | ( OUT_DTYPE << 8 ) | ( POOL_DTYPE ))
#define MAXPOOL_KERNELS( IN_DTYPE0, OUT_DTYPE ) \
{ POOL_HASH_KEY( IN_DTYPE0, OUT_DTYPE, _MAX ), \
CVIVANTE_NAMESPACE("cl.maxpool_"#IN_DTYPE0"to"#OUT_DTYPE), \
_MAXPOOL_KERNEL_SOURCE_NAME },

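With this layout each selector occupies its own byte, so keys stay unambiguous as long as the dtype enumerators fit in 8 bits. A quick worked expansion of the macro above (F32 and _MAX are the enumerators already in scope; F32's numeric value is library-defined, the packing is what matters):

/* Worked expansion of POOL_HASH_KEY for the F32-to-F32 max-pool entry;
 * since _MAX == 0 in the enum above, the low byte is zero. */
uint32_t key = POOL_HASH_KEY( F32, F32, _MAX );
/* == ( F32 << 16 ) | ( F32 << 8 ) | 0 */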
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;

static const _kernel_map_type kernel_map[] =
{
// Register kernel here
MAXPOOL_KERNELS( I32, I32 )
MAXPOOL_KERNELS( U32, U32 )
MAXPOOL_KERNELS( F32, F32 )
MAXPOOL_KERNELS( U32, F32 )
MAXPOOL_KERNELS( F32, U32 )
};

/*
* Kernel params
*/

static vx_param_description_t _maxpool_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _MAXPOOL_PARAM_NUM _cnt_of_array( _maxpool_kernel_param_def )

/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_maxpool_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_status status = VSI_FAILURE;
vx_tensor output = (vx_tensor)param[1];
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t *output_shape = NULL;

VSI_UNREFERENCED(param_size);

output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output );
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );

output_shape = output_attr->shape;

gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = (output_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0];
gpu_param.global_size[1] = (output_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1];
gpu_param.global_size[2] = (output_shape->data[2] + gpu_param.global_scale[2] - 1)
/ gpu_param.global_scale[2];
status = vsi_nn_kernel_gpu_config( node, &gpu_param );

final:
if (output_attr)
{
vsi_nn_kernel_tensor_attr_release(&output_attr);
}

return status;
} /* _maxpool_initializer() */


/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t pool_type
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype = U8;
vsi_nn_kernel_dtype_e out_dtype = U8;
uint32_t key = 0;
uint32_t i = 0;

in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

if (in_dtype == U8)
{
in_dtype = U32;
}
else if (in_dtype == F16)
{
in_dtype = F32;
}
else if (in_dtype == I8 || in_dtype == I16)
{
in_dtype = I32;
}

if (out_dtype == U8)
{
out_dtype = U32;
}
else if (out_dtype == F16)
{
out_dtype = F32;
}
else if (out_dtype == I8 || out_dtype == I16)
{
out_dtype = I32;
}

key = POOL_HASH_KEY( in_dtype, out_dtype, pool_type );

for ( i = 0; i < (uint32_t)_cnt_of_array(kernel_map); i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)_cnt_of_array(kernel_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = _maxpool_kernel_param_def;
kernel->info.numParams = (uint32_t)_cnt_of_array(_maxpool_kernel_param_def);
kernel->info.initialize = _maxpool_initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}

return status;
} /* _query_kernel() */


static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_MAXPOOL_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
int32_t pool_type = vsi_nn_kernel_param_get_int32( params, "pool_type" );
int32_t pool_size_x = vsi_nn_kernel_param_get_int32( params, "pool_size_x" );
int32_t pool_size_y = vsi_nn_kernel_param_get_int32( params, "pool_size_y" );
int32_t pool_pad_x_left = vsi_nn_kernel_param_get_int32( params, "pool_pad_x_left" );
int32_t pool_pad_y_top = vsi_nn_kernel_param_get_int32( params, "pool_pad_y_top" );
int32_t stride_x = vsi_nn_kernel_param_get_int32( params, "stride_x" );
int32_t stride_y = vsi_nn_kernel_param_get_int32( params, "stride_y" );
int32_t dilation_x = vsi_nn_kernel_param_get_int32( params, "dilation_x" );
int32_t dilation_y = vsi_nn_kernel_param_get_int32( params, "dilation_y" );
int32_t kernel_dia_x = pool_size_x * dilation_x;
int32_t kernel_dia_y = pool_size_y * dilation_y;
float output_scale = vsi_nn_get_tensor_scale(outputs[0]);
float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float input_scale = vsi_nn_get_tensor_scale(inputs[0]);
float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float inout_scale = input_scale / output_scale;
float inout_tail = output_zp - input_zp * inout_scale;
int32_t width = (int32_t)inputs[0]->attr.size[0];
int32_t height = (int32_t)inputs[0]->attr.size[1];

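Two derived quantities drive the kernel: the dilated window, which samples positions x, x + dilation, ..., x + (pool_size - 1) * dilation and is passed as kernel_dia = pool_size * dilation, and the fused requantization pair inout_scale/inout_tail. A minimal sketch of that requantization step (names illustrative):

/* Requantization sketch: dequantize with (q_in - zp_in) * s_in, requantize
 * with / s_out + zp_out; folding both yields one multiply-add per element,
 * with inout_scale = s_in / s_out and inout_tail = zp_out - zp_in * inout_scale. */
static float requant(float q_in, float inout_scale, float inout_tail)
{
    return q_in * inout_scale + inout_tail;
}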
if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size,
inputs[0]->attr.dim_num )
|| !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ))
{
return NULL;
}

status = _query_kernel( kernel, inputs, outputs, pool_type );

if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
uint32_t index = 2;
vsi_nn_kernel_node_pack_io( node_params, _MAXPOOL_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_x );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_y );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pool_pad_x_left );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pool_pad_y_top );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &kernel_dia_x );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &kernel_dia_y );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &dilation_x );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &dilation_y );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inout_scale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inout_tail );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _MAXPOOL_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &node_params[2] );
vsi_nn_kernel_scalar_release( &node_params[3] );
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
vsi_nn_kernel_scalar_release( &node_params[6] );
vsi_nn_kernel_scalar_release( &node_params[7] );
vsi_nn_kernel_scalar_release( &node_params[8] );
vsi_nn_kernel_scalar_release( &node_params[9] );
vsi_nn_kernel_scalar_release( &node_params[10] );
vsi_nn_kernel_scalar_release( &node_params[11] );
vsi_nn_kernel_scalar_release( &node_params[12] );
vsi_nn_kernel_scalar_release( &node_params[13] );
}
}
return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_CL( pool, _setup )

@ -590,7 +590,7 @@ static vsi_nn_kernel_node_t _setup
ret = vsi_nn_kernel_optimize_element_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank);

if ( ret )
if ( !ret )
{
return NULL;
}

@ -349,7 +349,7 @@ DEF_KERNEL_INITIALIZER(_comparisons_initializer)
input1Scale = (float)((int64_t)1 << -fl);
}
}
else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
else if( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
input1Scale = attr[1]->asymm.scale;
input1Tail = 0 - attr[1]->asymm.zero_point * input1Scale;

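The fix reads the quantization kind from attr[1], the same tensor whose scale and zero point are consumed. For an asymmetric tensor the affine dequantization is x = scale * (q - zero_point), which is exactly what the scale/tail pair encodes. A one-line identity check (names illustrative):

/* Affine dequantization identity: scale * (q - zp) == q * scale + tail,
 * with tail = -zp * scale, matching input1Scale / input1Tail above. */
static float dequant(float q, float scale, float zp)
{
    float tail = 0.0f - zp * scale;
    return q * scale + tail;
}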
@ -866,21 +866,13 @@ static vsi_nn_kernel_node_t _setup

if (axis < 0)
{
axis_new = 0;
shapes[0][0] = 1;
shapes[0][1] = 1;
for (i = 0; i < inputs[0]->attr.dim_num; i++)
{
shapes[0][0] *= inputs[0]->attr.size[i];
}
rs_dim = 2;
}
else
{
vsi_nn_kernel_optimize_softmax_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
shapes[0], &rs_dim, &axis_new);
axis += (int32_t)inputs[0]->attr.dim_num;
}

vsi_nn_kernel_optimize_softmax_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
shapes[0], &rs_dim, &axis_new);

if (rs_dim > 3)
{
return NULL;

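Rather than hand-flattening the whole tensor, a negative axis is now normalized into [0, dim_num) first and the common shape-optimize path handles the rest. The normalization is the usual Python-style wrap, shown here in isolation:

/* Negative-axis wrap sketch: -1 means the last dimension, and so on. */
static int32_t normalize_axis(int32_t axis, uint32_t dim_num)
{
    if (axis < 0)
    {
        axis += (int32_t)dim_num;   /* e.g. axis = -1, dim_num = 4 -> axis = 3 */
    }
    return axis;
}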
@ -250,7 +250,8 @@ static vsi_status get_gather_tensor_reshape_size
sizes[0] = block_size;
sizes[1] = elementCnt / block_size;
sizes[2] = outerCnt;
if ((elementCnt / block_size) > VSI_NN_MAX_IMAGE_WIDTH)
if ((elementCnt / block_size) > VSI_NN_MAX_IMAGE_WIDTH ||
block_size > VSI_NN_MAX_IMAGE_WIDTH)
{
arrayFlg[0] = 1;
}
@ -490,6 +491,8 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer)
float src0Scale = 1;
int32_t dstZP = 0;
float dstScale = 1;
int32_t remainder = 0;
int32_t width = 0;

uint32_t pack_key = 0;

@ -546,6 +549,7 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer)
indices_num *= (int32_t)(input1_shape->data[i]);
}
batch = (int32_t)(input1_shape->data[input_dims1 - 1]);
width = (int32_t)(input1_shape->data[0]);

shaderParam.global_scale[0] = 4;
shaderParam.global_scale[1] = 1;
@ -562,6 +566,7 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer)
(IN0_TYPE | (OUT_TYPE << 8))

pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype);
remainder = indices_num % 4;

{
uint16_t M0 = 0;
@ -656,6 +661,8 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer)
{
status |= vsi_nn_kernel_gpu_add_param(node, "batch", &batch);
}
status |= vsi_nn_kernel_gpu_add_param(node, "width", &width);
status |= vsi_nn_kernel_gpu_add_param(node, "remainder", &remainder);
CHECK_STATUS_FAIL_GOTO(status, OnError );

OnError:
@ -763,20 +770,36 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_t node = NULL;
vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}};
vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
int32_t block_num = vsi_nn_kernel_param_get_int32( params, "block_num" );
int32_t axis_num = vsi_nn_kernel_param_get_int32( params, "axis_num" );
int32_t block_size = 1;
int32_t block_num = 1;
int32_t axis_num = 0;
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
int32_t batch_dims = vsi_nn_kernel_param_get_int32( params, "batch_dims" );
int32_t axis0_flg = 0;
int32_t is_array = block_size > VSI_NN_MAX_BLOCK_SIZE ? 1 : 0;
int32_t is_array = 0;
int32_t is_batch = batch_dims > 0 ? 1 : 0;
vsi_size_t rs_dim = batch_dims == 0 ? 2 : 3;
int32_t i = 0;
vsi_size_t *input_size = inputs[0]->attr.size;
uint32_t i = 0;
uint32_t r_rank = vsi_nn_GetTensorIsScalar(inputs[0]) ? 0 : inputs[0]->attr.dim_num;

VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);

for (i = 0; i < (uint32_t)axis; ++i)
{
block_size *= (int32_t)input_size[i];
}

axis_num = (int32_t)input_size[axis];

for (i = axis + 1; i < r_rank - batch_dims; ++i)
{
block_num *= (int32_t)input_size[i];
}

is_array = block_size > VSI_NN_MAX_BLOCK_SIZE ? 1 : 0;

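block_size, axis_num and block_num now come straight from the input shape instead of kernel params: the dims below the gather axis collapse into block_size (the contiguous run length), the axis itself is axis_num, and the dims above it, minus the batch dims, collapse into block_num. The decomposition in isolation (names illustrative):

/* Shape decomposition sketch for gather along `axis` (width-first layout):
 * size = {d0, ..., d(r-1)}, block_size = d0 * ... * d(axis-1),
 * axis_num = d(axis), block_num = d(axis+1) * ... * d(r-1-batch_dims). */
static void gather_blocks(const vsi_size_t *size, uint32_t rank,
                          int32_t axis, int32_t batch_dims,
                          int32_t *block_size, int32_t *axis_num, int32_t *block_num)
{
    uint32_t i;
    *block_size = 1;
    *block_num = 1;
    for (i = 0; i < (uint32_t)axis; i++)
        *block_size *= (int32_t)size[i];
    *axis_num = (int32_t)size[axis];
    for (i = (uint32_t)axis + 1; i < rank - (uint32_t)batch_dims; i++)
        *block_num *= (int32_t)size[i];
}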
if (axis == 0)
{
status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], axis_num, batch_dims, 0, &is_array);

@ -47,11 +47,10 @@ typedef enum
} _internal_kernel_e;

#define _GRUCELL_ACTIVATION_KERNEL_SOURCE "grucell_activation"
#define _GRUCELL_ACTIVATION_KERNEL_NAME CVIVANTE_NAMESPACE("evis.grucell_activation")

#define _CDNN_KERNEL_SOURCE0 "grucell_cdnn_activation"
#define _CDNN_KERNEL_SOURCE1 "grucell_cdnn_activation_u8"
#define _GRUCELL_ACTIVATION_CDNN_KERNEL_NAME CVIVANTE_NAMESPACE("evis.grucell_cdnn_activation")
#define _KERNEL_SOURCE2 "grucell_cdnn_activation_bf16"

typedef enum _batch_fisrt_layerout_e
{
|
@ -114,6 +113,11 @@ static const _kernel_map_type _grucell_activation_kernel_map[] =
PACK_KERNEL_MAP( U8, U8, U8, U8, hsigmoid, VSI_NN_ACT_TANH, CN),
PACK_KERNEL_MAP( F16, F16, F16, F16, hsigmoid, VSI_NN_ACT_TANH, CN),
PACK_KERNEL_MAP( F16, F16, F16, U8, hsigmoid, VSI_NN_ACT_TANH, CN),

PACK_KERNEL_MAP( BF16, BF16, BF16, BF16, sigmoid, VSI_NN_ACT_TANH, NC),
PACK_KERNEL_MAP( BF16, BF16, BF16, BF16, sigmoid, VSI_NN_ACT_TANH, CN),
PACK_KERNEL_MAP( BF16, BF16, BF16, BF16, hsigmoid, VSI_NN_ACT_TANH, NC),
PACK_KERNEL_MAP( BF16, BF16, BF16, BF16, hsigmoid, VSI_NN_ACT_TANH, CN),
};

static const _kernel_map_type _grucell_cunn_sep_activation_kernel_map[] =
|
@ -130,6 +134,12 @@ static const _kernel_map_type _grucell_cunn_sep_activation_kernel_map[] =
PACK_KERNEL_CDNN_SEP_MAP( U8, U8, U8, U8, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, CN, _CDNN_KERNEL_SOURCE1 ),

PACK_KERNEL_CDNN_SEP_MAP( U8, U8, U8, U8, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, CN_FULL, _CDNN_KERNEL_SOURCE1 ),

PACK_KERNEL_CDNN_SEP_MAP( BF16, BF16, BF16, BF16, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, NC, _KERNEL_SOURCE2 ),

PACK_KERNEL_CDNN_SEP_MAP( BF16, BF16, BF16, BF16, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, CN, _KERNEL_SOURCE2 ),

PACK_KERNEL_CDNN_SEP_MAP( BF16, BF16, BF16, BF16, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, CN_FULL, _KERNEL_SOURCE2 ),
};

static const _kernel_map_type _grucell_cunn_activation_kernel_map[] =
|
@ -142,6 +152,10 @@ static const _kernel_map_type _grucell_cunn_activation_kernel_map[] =
PACK_KERNEL_CDNN_MAP( U8, U8, U8, U8, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, NC, _CDNN_KERNEL_SOURCE1 ),

PACK_KERNEL_CDNN_MAP( U8, U8, U8, U8, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, CN, _CDNN_KERNEL_SOURCE1 ),

PACK_KERNEL_CDNN_MAP( BF16, BF16, BF16, BF16, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, NC, _KERNEL_SOURCE2 ),

PACK_KERNEL_CDNN_MAP( BF16, BF16, BF16, BF16, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, CN, _KERNEL_SOURCE2 ),
};

/*
@ -322,6 +336,37 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_initializer)
"tensorScale", &tensorScale );
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
case _PACK_SELECT_KEY(BF16, BF16, BF16, BF16):
{
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};

status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", &uniExtractOddData_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "tensorZP", &tensorZP );
status |= vsi_nn_kernel_gpu_add_param( node, "tensorScale", &tensorScale );
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
default:
break;
}
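The two DP instructions implement the standard bfloat16 widening: a BF16 value is the top 16 bits of an IEEE-754 float32, so conversion amounts to a 16-bit shift, and the extract-odd step is the inverse selection when packing results back. The same trick in plain C, for reference only (a sketch of the numerics, not the EVIS microcode):

#include <stdint.h>
#include <string.h>

/* BF16 -> F32: place the 16 payload bits in the high half of a float32. */
static float bf16_to_f32(uint16_t bf16)
{
    uint32_t bits = (uint32_t)bf16 << 16;
    float f;
    memcpy(&f, &bits, sizeof f);
    return f;
}

/* F32 -> BF16 (truncating sketch; the hardware path may round differently). */
static uint16_t f32_to_bf16(float f)
{
    uint32_t bits;
    memcpy(&bits, &f, sizeof bits);
    return (uint16_t)(bits >> 16);
}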
@ -604,6 +649,34 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_cdnn_initializer)
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
case _PACK_SELECT_KEY(BF16, BF16, BF16, BF16):
{
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};

status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", &uniExtractOddData_2x8);
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
default:
break;
}

@ -80,9 +80,11 @@ typedef struct

static const _kernel_map_type _grucell_activation_sma_kernel_map[] =
{
PACK_KERNEL_MAP(F16, F16, F16, F16),
PACK_KERNEL_MAP(F16, F16, F16, F16),
PACK_KERNEL_MAP(BF16, BF16, BF16, BF16),

PACK_KERNEL_MAP_2D(F16, F16, F16, F16),
PACK_KERNEL_MAP_2D(F16, F16, F16, F16),
PACK_KERNEL_MAP_2D(BF16, BF16, BF16, BF16),
};

/*
@ -200,6 +202,45 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_sma_initializer)
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
case _PACK_A_GRUCELL_ACTIVATION_SMA_KEY(BF16, BF16, BF16, BF16):
{
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x05050404, 0x07070606, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};

status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", &uniExtractOddData_2x8);
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
default:
break;
}

@ -68,14 +68,16 @@ typedef struct
static const _kernel_map_type _grucell_activation_z_h_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP( U8, F16, U8, SIGMOID ),
PACK_KERNEL_MAP( I8, F16, I8, SIGMOID ),
PACK_KERNEL_MAP( I16, F16, I16, SIGMOID ),
PACK_KERNEL_MAP( F16, F16, F16, SIGMOID ),
PACK_KERNEL_MAP( U8, F16, U8, HSIGMOID ),
PACK_KERNEL_MAP( I8, F16, I8, HSIGMOID ),
PACK_KERNEL_MAP( I16, F16, I16, HSIGMOID ),
PACK_KERNEL_MAP( F16, F16, F16, HSIGMOID ),
PACK_KERNEL_MAP( U8, F16, U8, SIGMOID ),
PACK_KERNEL_MAP( I8, F16, I8, SIGMOID ),
PACK_KERNEL_MAP( I16, F16, I16, SIGMOID ),
PACK_KERNEL_MAP( F16, F16, F16, SIGMOID ),
PACK_KERNEL_MAP( BF16, BF16, BF16, SIGMOID ),
PACK_KERNEL_MAP( U8, F16, U8, HSIGMOID ),
PACK_KERNEL_MAP( I8, F16, I8, HSIGMOID ),
PACK_KERNEL_MAP( I16, F16, I16, HSIGMOID ),
PACK_KERNEL_MAP( F16, F16, F16, HSIGMOID ),
PACK_KERNEL_MAP( BF16, BF16, BF16, HSIGMOID ),
};

/*
@ -218,6 +220,34 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_z_h_initializer)
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
case _PACK_SELECT_KEY(BF16, BF16, BF16):
{
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};

status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", &uniExtractOddData_2x8);
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
case _PACK_SELECT_KEY(U8, F16, U8):
case _PACK_SELECT_KEY(I8, F16, I8):
case _PACK_SELECT_KEY(I16, F16, I16):

@ -67,14 +67,16 @@ typedef struct
static const _kernel_map_type _grucell_h_times_activation_r_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP( U8, F16, F16, SIGMOID ),
PACK_KERNEL_MAP( I8, F16, F16, SIGMOID ),
PACK_KERNEL_MAP( I16, F16, F16, SIGMOID ),
PACK_KERNEL_MAP( F16, F16, F16, SIGMOID ),
PACK_KERNEL_MAP( U8, F16, F16, HSIGMOID ),
PACK_KERNEL_MAP( I8, F16, F16, HSIGMOID ),
PACK_KERNEL_MAP( I16, F16, F16, HSIGMOID ),
PACK_KERNEL_MAP( F16, F16, F16, HSIGMOID ),
PACK_KERNEL_MAP( U8, F16, F16, SIGMOID ),
PACK_KERNEL_MAP( I8, F16, F16, SIGMOID ),
PACK_KERNEL_MAP( I16, F16, F16, SIGMOID ),
PACK_KERNEL_MAP( F16, F16, F16, SIGMOID ),
PACK_KERNEL_MAP( BF16, BF16, BF16, SIGMOID ),
PACK_KERNEL_MAP( U8, F16, F16, HSIGMOID ),
PACK_KERNEL_MAP( I8, F16, F16, HSIGMOID ),
PACK_KERNEL_MAP( I16, F16, F16, HSIGMOID ),
PACK_KERNEL_MAP( F16, F16, F16, HSIGMOID ),
PACK_KERNEL_MAP( BF16, BF16, BF16, HSIGMOID ),
};

/*
@ -194,6 +196,34 @@ DEF_KERNEL_INITIALIZER(_grucell_h_times_activation_r_initializer)
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
case _PACK_SELECT_KEY(BF16, BF16, BF16):
{
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};

status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", &uniExtractOddData_2x8);
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
case _PACK_SELECT_KEY(U8, F16, F16):
case _PACK_SELECT_KEY(I8, F16, F16):
case _PACK_SELECT_KEY(I16, F16, F16):

@ -70,14 +70,16 @@ typedef struct
static const _kernel_map_type _grucell_reset_after_activation_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP( U8, F16, U8, SIGMOID, TANH ),
PACK_KERNEL_MAP( I8, F16, I8, SIGMOID, TANH ),
PACK_KERNEL_MAP( I16, F16, I16, SIGMOID, TANH ),
PACK_KERNEL_MAP( F16, F16, F16, SIGMOID, TANH ),
PACK_KERNEL_MAP( U8, F16, U8, SIGMOID, SIGMOID ),
PACK_KERNEL_MAP( I8, F16, I8, SIGMOID, SIGMOID ),
PACK_KERNEL_MAP( I16, F16, I16, SIGMOID, SIGMOID ),
PACK_KERNEL_MAP( F16, F16, F16, SIGMOID, SIGMOID ),
PACK_KERNEL_MAP( U8, F16, U8, SIGMOID, TANH ),
PACK_KERNEL_MAP( I8, F16, I8, SIGMOID, TANH ),
PACK_KERNEL_MAP( I16, F16, I16, SIGMOID, TANH ),
PACK_KERNEL_MAP( F16, F16, F16, SIGMOID, TANH ),
PACK_KERNEL_MAP( BF16, BF16, BF16, SIGMOID, TANH ),
PACK_KERNEL_MAP( U8, F16, U8, SIGMOID, SIGMOID ),
PACK_KERNEL_MAP( I8, F16, I8, SIGMOID, SIGMOID ),
PACK_KERNEL_MAP( I16, F16, I16, SIGMOID, SIGMOID ),
PACK_KERNEL_MAP( F16, F16, F16, SIGMOID, SIGMOID ),
PACK_KERNEL_MAP( BF16, BF16, BF16, SIGMOID, SIGMOID ),
};

@ -224,6 +226,34 @@ DEF_KERNEL_INITIALIZER(_grucell_reset_after_activation_initializer)
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
case _PACK_SELECT_KEY(BF16, BF16, BF16):
{
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};

status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", &uniExtractOddData_2x8);
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
case _PACK_SELECT_KEY(U8, F16, U8):
case _PACK_SELECT_KEY(I8, F16, I8):
case _PACK_SELECT_KEY(I16, F16, I16):

@ -439,6 +439,32 @@ static const _kernel_map_type _lstmunit_activation_kernel_map[] =
GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, U8, I16, F16, HARD_SIGMOID, SP)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, U8, I8, F16, HARD_SIGMOID, SP)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, U8, U8, F16, HARD_SIGMOID, SP)

/* BF16 type */
GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 1, 0, 0, BF16, BF16, BF16, SIGMOID, CLP)
GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 1, 0, 0, BF16, BF16, BF16, SIGMOID, LP)
GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 0, 0, 0, BF16, BF16, BF16, SIGMOID, CL)
GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 0, 0, 0, BF16, BF16, BF16, SIGMOID, L)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, BF16, BF16, BF16, SIGMOID, BP)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, BF16, BF16, BF16, SIGMOID, B)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, BF16, BF16, BF16, SIGMOID, CBP)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, BF16, BF16, BF16, SIGMOID, CB)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, BF16, BF16, BF16, SIGMOID, SP)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, BF16, BF16, BF16, SIGMOID, S)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, BF16, BF16, BF16, SIGMOID, CSP)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, BF16, BF16, BF16, SIGMOID, CS)
GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 1, 0, 0, BF16, BF16, BF16, HARD_SIGMOID, CLP)
GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 1, 0, 0, BF16, BF16, BF16, HARD_SIGMOID, LP)
GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 0, 0, 0, BF16, BF16, BF16, HARD_SIGMOID, CL)
GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 0, 0, 0, BF16, BF16, BF16, HARD_SIGMOID, L)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, BF16, BF16, BF16, HARD_SIGMOID, BP)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, BF16, BF16, BF16, HARD_SIGMOID, B)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, BF16, BF16, BF16, HARD_SIGMOID, CBP)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, BF16, BF16, BF16, HARD_SIGMOID, CB)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, BF16, BF16, BF16, HARD_SIGMOID, SP)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, BF16, BF16, BF16, HARD_SIGMOID, S)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, BF16, BF16, BF16, HARD_SIGMOID, CSP)
GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, BF16, BF16, BF16, HARD_SIGMOID, CS)
};

@ -1135,6 +1161,26 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer)
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};

if (dstQuantType == VSI_NN_KERNEL_QUANT_DFP)
{
@@ -1152,31 +1198,41 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer)

if ( cellFormat == F16 )
{
vsi_nn_kernel_gpu_add_param(node, "uniExtractHalf4_4x4", &uniExtractHalf4_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractHalf4_4x4", &uniExtractHalf4_4x4);
}

if ( dstFormat == F16 )
{
vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8);
}
else if ( dstFormat != BF16 )
{
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractInteger_2x8);
}

if ( cellFormat == BF16 && dstFormat == BF16)
{
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", &uniExtractOddData_2x8);
}
else
{
vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractInteger_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniFp16toFp32_4x4", &uniFp16toFp32_4x4);
}
CHECK_STATUS_FAIL_GOTO(status, final );

vsi_nn_kernel_gpu_add_param(node, "uniFp16toFp32_4x4", &uniFp16toFp32_4x4);
vsi_nn_kernel_gpu_add_param(node, "logE", &logE);
vsi_nn_kernel_gpu_add_param(node, "twoLogE", &twoLogE);
vsi_nn_kernel_gpu_add_param(node, "outputScale", &outputScale);
vsi_nn_kernel_gpu_add_param(node, "outputZP", &outputZP);
vsi_nn_kernel_gpu_add_param(node, "forget_bias", &forget_bias);
vsi_nn_kernel_gpu_add_param(node, "clip_Min_F", clip_Min_F);
vsi_nn_kernel_gpu_add_param(node, "clip_Max_F", clip_Max_F);

status = vsi_nn_kernel_gpu_add_param(node, "logE", &logE);
status |= vsi_nn_kernel_gpu_add_param(node, "twoLogE", &twoLogE);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &outputScale);
status |= vsi_nn_kernel_gpu_add_param(node, "outputZP", &outputZP);
status |= vsi_nn_kernel_gpu_add_param(node, "forget_bias", &forget_bias);
status |= vsi_nn_kernel_gpu_add_param(node, "clip_Min_F", clip_Min_F);
status |= vsi_nn_kernel_gpu_add_param(node, "clip_Max_F", clip_Max_F);
if ( !_is_ln && input_attr[S_INPUT_FC_F]->dtype == F16 )
{
vsi_nn_kernel_gpu_add_param(node, "uniFp16AddFp16toFp32_4x4", &uniFp16AddFp16toFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniFp16AddFp16toFp32_4x4", &uniFp16AddFp16toFp32_4x4);
}
CHECK_STATUS_FAIL_GOTO(status, final );

if (input_attr[S_INPUT_FC_F]->dtype == U8 &&
input_attr[S_INPUT_FC_F]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
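The pattern introduced throughout this hunk is worth calling out: each setup call's vsi_status is OR-ed into one accumulator and checked once, so no failure is silently dropped. A minimal sketch of the idiom, using the names from the hunk above:

/* Sketch only: accumulate results and check once instead of ignoring returns. */
vsi_status status = vsi_nn_kernel_gpu_add_param(node, "logE", &logE);
status |= vsi_nn_kernel_gpu_add_param(node, "twoLogE", &twoLogE);
/* ... further parameters ... */
CHECK_STATUS_FAIL_GOTO(status, final);  /* any failure jumps to cleanup */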
@@ -1380,8 +1436,8 @@ static vsi_status _query_kernel
vx_param_description_t * param_def = NULL;
size_t param_def_size = _LSTMUNIT_ACTIVATION_MAX_PARAM_NUM;
vx_kernel_initialize_f initializer = _lstmunit_activation_initializer;
uint32_t key;
uint32_t i;
uint32_t key = 0;
uint32_t i = 0;

set_vx_param_description_t( lstm_activation, &param_def );
@@ -36,6 +36,7 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"

__BEGIN_DECLS
@@ -1576,21 +1577,22 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_param_t tmp_params[_MATRIX_MUL_CROSS_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL;
vsi_nn_tensor_t* tmp_inputs[2] = {NULL};
vsi_nn_tensor_t* tmp_outputs[1] = {NULL};
int32_t transposeA = vsi_nn_kernel_param_get_int32( params, "transposeA" );
int32_t transposeB = vsi_nn_kernel_param_get_int32( params, "transposeB" );
int32_t adjointA = vsi_nn_kernel_param_get_int32( params, "adjointA" );
int32_t adjointB = vsi_nn_kernel_param_get_int32( params, "adjointB" );
uint32_t cross_flg = vsi_nn_kernel_param_get_int32( params, "cross_flg" );
size_t tmp_size = 0;
uint32_t* size_axis_in_out = NULL;
uint32_t* stride_axis_in_out = NULL;
vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}};
uint32_t new_rank[3] = {0};
vsi_size_t M = inputs[0]->attr.size[1];
vsi_size_t K = inputs[0]->attr.size[0];
vsi_size_t N = inputs[1]->attr.size[0];
vsi_size_t depthA = 1, depthB = 1;

size_axis_in_out = (uint32_t *)vsi_nn_kernel_param_get_buffer( params, "size_axis_inner_outer", &tmp_size);
stride_axis_in_out = (uint32_t *)vsi_nn_kernel_param_get_buffer( params, "stride_axis_inner_outer", &tmp_size);
uint32_t cross_flg = 0;
uint32_t size_axis_in_out[3] = {0};
uint32_t stride_axis_in_out[9] = {0};

VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
@@ -1609,35 +1611,62 @@ static vsi_nn_kernel_node_t _setup
return NULL;
}

status = vsi_nn_kernel_optimize_matrixmul_broadcast_shape(
inputs[0]->attr.size,
inputs[1]->attr.size,
outputs[0]->attr.size,
inputs[0]->attr.dim_num,
inputs[1]->attr.dim_num,
outputs[0]->attr.dim_num,
shapes[0], shapes[1], shapes[2], new_rank,
&cross_flg, size_axis_in_out, stride_axis_in_out);
if (status)
{
tmp_inputs[0] = vsi_nn_reshape_tensor(graph, inputs[0], shapes[0], new_rank[0]);
tmp_inputs[1] = vsi_nn_reshape_tensor(graph, inputs[1], shapes[1], new_rank[1]);
tmp_outputs[0] = vsi_nn_reshape_tensor(graph, outputs[0], shapes[2], new_rank[2]);

M = tmp_inputs[0]->attr.size[1];
K = tmp_inputs[0]->attr.size[0];
N = tmp_inputs[1]->attr.size[0];
}
else
{
VSILOGE("illegal inputs shape");
status = VSI_FAILURE;
goto final;
}

if (transposeA)
{
K = inputs[0]->attr.size[1];
M = inputs[0]->attr.size[0];
K = tmp_inputs[0]->attr.size[1];
M = tmp_inputs[0]->attr.size[0];
}
else if (transposeB)
{
N = inputs[1]->attr.size[1];
N = tmp_inputs[1]->attr.size[1];
}

depthA = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1;
depthB = inputs[1]->attr.dim_num > 2 ? inputs[1]->attr.size[2] : 1;
depthA = tmp_inputs[0]->attr.dim_num > 2 ? tmp_inputs[0]->attr.size[2] : 1;
depthB = tmp_inputs[1]->attr.dim_num > 2 ? tmp_inputs[1]->attr.size[2] : 1;

if (M == 1 && depthB == 1 && depthA > 1)
{
vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0};
shape[0] = inputs[0]->attr.size[0];
shape[1] = inputs[0]->attr.size[2];
shape[0] = tmp_inputs[0]->attr.size[0];
shape[1] = tmp_inputs[0]->attr.size[2];
shape[2] = 1;
shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1;
rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 4 );
shape[3] = tmp_inputs[0]->attr.dim_num > 3 ? tmp_inputs[0]->attr.size[3] : 1;
rs_input = vsi_nn_kernel_tensor_reshape( tmp_inputs[0]->t, shape, 4 );

shape[0] = outputs[0]->attr.size[0];
shape[1] = outputs[0]->attr.size[2];
shape[0] = tmp_outputs[0]->attr.size[0];
shape[1] = tmp_outputs[0]->attr.size[2];
shape[2] = 1;
shape[3] = outputs[0]->attr.dim_num > 3 ? outputs[0]->attr.size[3] : 1;
rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 );
shape[3] = tmp_outputs[0]->attr.dim_num > 3 ? tmp_outputs[0]->attr.size[3] : 1;
rs_output = vsi_nn_kernel_tensor_reshape( tmp_outputs[0]->t, shape, 4 );
}

status = _query_kernel( inputs, outputs, kernel, transposeA, transposeB, cross_flg );
status = _query_kernel( tmp_inputs, tmp_outputs, kernel, transposeA, transposeB, cross_flg );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
@@ -1649,13 +1678,13 @@ static vsi_nn_kernel_node_t _setup
if (rs_input)
{
tmp_params[0] = rs_input;
tmp_params[1] = (vsi_nn_kernel_node_param_t)(inputs[1]->t);
tmp_params[1] = (vsi_nn_kernel_node_param_t)(tmp_inputs[1]->t);
tmp_params[2] = rs_output;
}
else
{
vsi_nn_kernel_node_pack_io( tmp_params, param_num,
inputs, 2, outputs, 1 );
tmp_inputs, 2, tmp_outputs, 1 );
}
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &transposeA );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &transposeB );
@@ -1725,6 +1754,10 @@ static vsi_nn_kernel_node_t _setup
}
}
}
final:
vsi_safe_release_tensor( tmp_inputs[0] );
vsi_safe_release_tensor( tmp_inputs[1] );
vsi_safe_release_tensor( tmp_outputs[0] );
if (rs_input)
{
vsi_nn_kernel_tensor_release( &rs_input );
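To summarize the matrixmul changes: the setup now reshapes both inputs and the output through the broadcast optimizer into tmp_inputs/tmp_outputs, builds the node against those views, and releases them at final: on every path. A condensed fragment of the flow above (sketch only, not compilable on its own; error handling elided):

/* Sketch of the reshape-then-release flow added above. */
if (vsi_nn_kernel_optimize_matrixmul_broadcast_shape( /* sizes, ranks, ... */ ))
{
    tmp_inputs[0]  = vsi_nn_reshape_tensor(graph, inputs[0],  shapes[0], new_rank[0]);
    tmp_inputs[1]  = vsi_nn_reshape_tensor(graph, inputs[1],  shapes[1], new_rank[1]);
    tmp_outputs[0] = vsi_nn_reshape_tensor(graph, outputs[0], shapes[2], new_rank[2]);
    /* ... query the kernel and create the node from tmp_* ... */
}
final:
vsi_safe_release_tensor(tmp_inputs[0]);
vsi_safe_release_tensor(tmp_inputs[1]);
vsi_safe_release_tensor(tmp_outputs[0]);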
@ -0,0 +1,374 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "kernel/vsi_nn_kernel_eltwise.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
#define KERNEL_SOURCE_0 "maxpool",
|
||||
|
||||
typedef enum
|
||||
{
|
||||
_error = -1,
|
||||
_MAX = 0,
|
||||
_AVG
|
||||
} vsi_nn_pool_type_e;
|
||||
|
||||
#define HASH_POOL_KEY(_input_type, _output_type, _pool_type, _image_2d) \
|
||||
((_input_type << 24) | (_output_type << 16) | (_pool_type << 8) | (_image_2d))
|
||||
|
||||
#define HASH_MAXPOOL_SH_KERNEL_NAME(SRC_TYPE, DST_TYPE) \
|
||||
CVIVANTE_NAMESPACE("evis.maxpool_"#SRC_TYPE"to"#DST_TYPE)
|
||||
|
||||
#define MAXPOOL_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
|
||||
{ HASH_POOL_KEY(IN0_TYPE, OUT_TYPE, _MAX, 0), \
|
||||
HASH_MAXPOOL_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \
|
||||
SOURCE },
|
||||
|
||||
static const struct {
|
||||
uint32_t key;
|
||||
char* function_name;
|
||||
const char* source_name;
|
||||
} kernel_map[] =
|
||||
{
|
||||
MAXPOOL_KERNELS(F16, F16, KERNEL_SOURCE_0)
|
||||
MAXPOOL_KERNELS(BF16, BF16, KERNEL_SOURCE_0)
|
||||
MAXPOOL_KERNELS(I8, I8, KERNEL_SOURCE_0)
|
||||
MAXPOOL_KERNELS(U8, U8, KERNEL_SOURCE_0)
|
||||
MAXPOOL_KERNELS(I16, I16, KERNEL_SOURCE_0)
|
||||
MAXPOOL_KERNELS(U8, F16, KERNEL_SOURCE_0)
|
||||
MAXPOOL_KERNELS(I8, F16, KERNEL_SOURCE_0)
|
||||
MAXPOOL_KERNELS(I16, F16, KERNEL_SOURCE_0)
|
||||
MAXPOOL_KERNELS(F16, I8, KERNEL_SOURCE_0)
|
||||
MAXPOOL_KERNELS(F16, U8, KERNEL_SOURCE_0)
|
||||
MAXPOOL_KERNELS(F16, I16, KERNEL_SOURCE_0)
|
||||
};
|
||||
|
||||
static vx_param_description_t kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}
|
||||
};
|
||||
#define _EVIS_PARAM_NUM _cnt_of_array(kernel_param_def)
|
||||
|
||||
DEF_KERNEL_INITIALIZER(_maxpool_initializer)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
gpu_param_t gpu_param = {
|
||||
3,
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0}
|
||||
};
|
||||
float input_zp = 0.0f;
|
||||
float input_scale = 1.0f;
|
||||
float output_zp = 0;
|
||||
float output_scale = 1.0f;
|
||||
float inout_scale = 1.0f;
|
||||
float inout_tail = 0.0f;
|
||||
int32_t width = 0;
|
||||
int32_t height = 0;
|
||||
|
||||
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
|
||||
vsi_size_array_t * out_shape = NULL;
|
||||
uint32_t pack_key = 0;
|
||||
|
||||
VSI_UNREFERENCED(param_size);
|
||||
|
||||
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
|
||||
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
|
||||
|
||||
out_shape = attr[1]->shape;
|
||||
|
||||
width = (int32_t)attr[0]->shape->data[0];
|
||||
height = (int32_t)attr[0]->shape->data[1];
|
||||
|
||||
input_scale = attr[0]->scale;
|
||||
input_zp = (float)attr[0]->zero_point;
|
||||
output_scale = attr[1]->scale;
|
||||
output_zp = (float)attr[1]->zero_point;
|
||||
|
||||
inout_scale = input_scale / output_scale;
|
||||
inout_tail = output_zp - input_zp * inout_scale;
|
||||
|
||||
#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE ) \
|
||||
(IN0_TYPE | ( OUT_TYPE << 16))
|
||||
|
||||
pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype );
|
||||
|
||||
gpu_param.global_scale[0] = 1;
|
||||
gpu_param.global_scale[1] = 1;
|
||||
gpu_param.global_scale[2] = 1;
|
||||
|
||||
gpu_param.global_size[0] = (out_shape->data[0] + gpu_param.global_scale[0] - 1)
|
||||
/ gpu_param.global_scale[0];
|
||||
gpu_param.global_size[1] = (
|
||||
(out_shape->data[1] + gpu_param.global_scale[1] - 1)
|
||||
/ gpu_param.global_scale[1]);
|
||||
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
|
||||
|
||||
{
|
||||
gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
|
||||
0x33333333, // TCfg
|
||||
0x11110000, // ASelt
|
||||
0x03020100, 0x03020100, // ABin
|
||||
0x00000000, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00002400, // AccumType, ConstantType, and PostShift
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
|
||||
}, GPU_DP_TYPE_16 };
|
||||
gpu_dp_inst_t uniConvF16toFp32_4x4 = {{
|
||||
0x01010101, // TCfg
|
||||
0x00000000, // ASelt
|
||||
0x00010000, 0x00030002, // ABin
|
||||
0x02020202, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00000100, // AccumType, ConstantType, and PostShift
|
||||
0x00003c00, 0x00000000, 0x00003c00, 0x00000000,
|
||||
0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
|
||||
}, GPU_DP_TYPE_16};
|
||||
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
|
||||
0x11111111, // TCfg
|
||||
0x01010101, // ASelt
|
||||
0x01050004, 0x03070206, // ABin
|
||||
0x22222222, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00000600, // AccumType, ConstantType, and PostShift
|
||||
0x00000001, 0x00000001, 0x00000001, 0x00000001,
|
||||
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
|
||||
}, GPU_DP_TYPE_16};
|
||||
gpu_dp_inst_t uniExtractOddData_2x8 = {{
|
||||
0x11111111, // TCfg
|
||||
0x11110000, // ASelt
|
||||
0x07050301, 0x07050301, // ABin
|
||||
0x22222222, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00000600, // AccumType, ConstantType, and PostShift
|
||||
0x00000001, 0x00000001, 0x00000001, 0x00000001,
|
||||
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
|
||||
}, GPU_DP_TYPE_16};
|
||||
|
||||
status = vsi_nn_kernel_gpu_add_param( node, "inout_scale", &inout_scale );
|
||||
status |= vsi_nn_kernel_gpu_add_param( node, "inout_tail", &inout_tail );
|
||||
status |= vsi_nn_kernel_gpu_add_param( node, "width", &width );
|
||||
status |= vsi_nn_kernel_gpu_add_param( node, "height", &height );
|
||||
CHECK_STATUS_FAIL_GOTO(status, final);
|
||||
|
||||
switch( pack_key )
|
||||
{
|
||||
case _PACK_SELECT_KEY( I8, I8 ):
|
||||
case _PACK_SELECT_KEY( U8, U8 ):
|
||||
case _PACK_SELECT_KEY( I16, I16 ):
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param( node,
|
||||
"uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 );
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
}
|
||||
break;
|
||||
case _PACK_SELECT_KEY( F16, I8 ):
|
||||
case _PACK_SELECT_KEY( F16, U8 ):
|
||||
case _PACK_SELECT_KEY( F16, I16 ):
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param( node,
|
||||
"uniConvF16toFp32_4x4", &uniConvF16toFp32_4x4 );
|
||||
status |= vsi_nn_kernel_gpu_add_param( node,
|
||||
"uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 );
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
}
|
||||
break;
|
||||
case _PACK_SELECT_KEY( BF16, BF16 ):
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param( node,
|
||||
"uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 );
|
||||
status |= vsi_nn_kernel_gpu_add_param( node,
|
||||
"uniExtractOddData_2x8", &uniExtractOddData_2x8 );
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
#undef _PACK_SELECT_KEY
|
||||
|
||||
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
|
||||
|
||||
final:
|
||||
if (attr[0])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &attr[0] );
|
||||
attr[0] = NULL;
|
||||
}
|
||||
if (attr[1])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &attr[1] );
|
||||
attr[1] = NULL;
|
||||
}
|
||||
|
||||
return status;
|
||||
} /* _maxpool_initializer() */
|
||||
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_tensor_t* const* const inputs,
|
||||
vsi_nn_tensor_t* const* const outputs,
|
||||
int32_t pool_type,
|
||||
vsi_nn_kernel_t* kernel
|
||||
)
|
||||
{
|
||||
vsi_nn_kernel_dtype_e input0_dtype = U8;
|
||||
vsi_nn_kernel_dtype_e output_dtype = U8;
|
||||
vsi_status status = VSI_FAILURE;
|
||||
uint32_t key = 0;
|
||||
size_t i = 0;
|
||||
|
||||
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
|
||||
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
|
||||
key = HASH_POOL_KEY( input0_dtype, output_dtype, pool_type, 0 );
|
||||
|
||||
for ( i = 0; i < _cnt_of_array(kernel_map); i ++ )
|
||||
{
|
||||
if ( kernel_map[i].key == key )
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
if ( i < _cnt_of_array(kernel_map) )
|
||||
{
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
|
||||
kernel->info.parameters = kernel_param_def;
|
||||
kernel->info.numParams = _cnt_of_array( kernel_param_def );
|
||||
kernel->info.initialize = _maxpool_initializer;
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
|
||||
"vsi_nn_kernel_header",
|
||||
kernel_map[i].source_name );
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
|
||||
kernel_map[i].source_name );
|
||||
status = VSI_SUCCESS;
|
||||
}
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t tmp_params[_EVIS_PARAM_NUM] = {NULL};
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
int32_t pool_type = vsi_nn_kernel_param_get_int32( params, "pool_type" );
|
||||
int32_t pool_size_x = vsi_nn_kernel_param_get_int32( params, "pool_size_x" );
|
||||
int32_t pool_size_y = vsi_nn_kernel_param_get_int32( params, "pool_size_y" );
|
||||
int32_t pool_pad_x_left = vsi_nn_kernel_param_get_int32( params, "pool_pad_x_left" );
|
||||
int32_t pool_pad_y_top = vsi_nn_kernel_param_get_int32( params, "pool_pad_y_top" );
|
||||
int32_t stride_x = vsi_nn_kernel_param_get_int32( params, "stride_x" );
|
||||
int32_t stride_y = vsi_nn_kernel_param_get_int32( params, "stride_y" );
|
||||
int32_t dilation_x = vsi_nn_kernel_param_get_int32( params, "dilation_x" );
|
||||
int32_t dilation_y = vsi_nn_kernel_param_get_int32( params, "dilation_y" );
|
||||
int32_t kernel_dia_x = pool_size_x * dilation_x;
|
||||
int32_t kernel_dia_y = pool_size_y * dilation_y;
|
||||
|
||||
VSI_UNREFERENCED(input_num);
|
||||
VSI_UNREFERENCED(output_num);
|
||||
VSI_UNREFERENCED(params);
|
||||
|
||||
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
|
||||
outputs[0]->attr.dim_num ) )
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
status = _query_kernel( inputs, outputs, pool_type, kernel );
|
||||
if ( VSI_SUCCESS == status )
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
uint32_t index = 2;
|
||||
/* Pass parameters to node. */
|
||||
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PARAM_NUM,
|
||||
inputs, 1, outputs, 1 );
|
||||
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_x );
|
||||
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_y );
|
||||
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pool_pad_x_left );
|
||||
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pool_pad_y_top );
|
||||
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &kernel_dia_x );
|
||||
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &kernel_dia_y );
|
||||
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &dilation_x );
|
||||
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &dilation_y );
|
||||
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PARAM_NUM );
|
||||
CHECK_STATUS(status);
|
||||
vsi_nn_kernel_scalar_release( &tmp_params[2] );
|
||||
vsi_nn_kernel_scalar_release( &tmp_params[3] );
|
||||
vsi_nn_kernel_scalar_release( &tmp_params[4] );
|
||||
vsi_nn_kernel_scalar_release( &tmp_params[5] );
|
||||
vsi_nn_kernel_scalar_release( &tmp_params[6] );
|
||||
vsi_nn_kernel_scalar_release( &tmp_params[7] );
|
||||
vsi_nn_kernel_scalar_release( &tmp_params[8] );
|
||||
vsi_nn_kernel_scalar_release( &tmp_params[9] );
|
||||
}
|
||||
}
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_EVIS( pool, _setup )
|
||||
|
|
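For reference, the dilated max pooling the new kernel implements can be sketched on the CPU as below. The kernel above passes kernel_dia_x/y = pool_size * dilation as the dilated extent; this sketch assumes a single float plane and handles padding with bounds checks (sketch only, not part of the patch):

/* CPU reference sketch of 2D max pooling with dilation. */
#include <float.h>
void maxpool2d_dilated( const float* in, float* out,
                        int width, int height, int out_w, int out_h,
                        int ksize_x, int ksize_y, int stride_x, int stride_y,
                        int pad_x, int pad_y, int dilation_x, int dilation_y )
{
    for (int oy = 0; oy < out_h; oy++)
    for (int ox = 0; ox < out_w; ox++)
    {
        float m = -FLT_MAX;
        for (int ky = 0; ky < ksize_y; ky++)
        for (int kx = 0; kx < ksize_x; kx++)
        {
            /* dilation spreads the taps: step kx*dilation_x, ky*dilation_y */
            int ix = ox * stride_x - pad_x + kx * dilation_x;
            int iy = oy * stride_y - pad_y + ky * dilation_y;
            if (ix >= 0 && ix < width && iy >= 0 && iy < height)
            {
                float v = in[iy * width + ix];
                m = v > m ? v : m;
            }
        }
        out[oy * out_w + ox] = m;
    }
}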
@@ -117,6 +117,17 @@ static vx_param_description_t vxPreProcessNv12Kernel_param_def[] =
};
#define _EVIS_PRE_PROCESS_NV12_PARAM_NUM _cnt_of_array(vxPreProcessNv12Kernel_param_def)

static vsi_bool _check_nv12_type_from_env()
{
vsi_bool ret = FALSE;
char* env_s = vsi_nn_getenv("VSI_NN_ENABLE_OCV_NV12");
if (env_s)
{
ret = TRUE;
}
return ret;
}

DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer)
(
vsi_nn_kernel_node_t node,
@@ -145,6 +156,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer)

vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_size_array_t * out_shape = NULL;
vsi_bool ocv_nv12 = _check_nv12_type_from_env();

VSI_UNREFERENCED(param_size);

@@ -208,7 +220,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer)
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };

gpu_dp_inst_t uniConvertNV12toB_4x4 = {{
0x05050505, // TCfg
0x04040404, // ASelt
@@ -239,6 +250,17 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer)
0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000,
0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractYtoShortSub16_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };

gpu_dp_inst_t uniExtractUVtoCharSub128_2x8 = {{
0x99999999, // TCfg
0x44444444, // ASelt
@@ -259,6 +281,61 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer)
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertUchartoFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000,
0x00030002, // ABin
0x02020202, // BSelt
0x00000000,
0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };

if (ocv_nv12)
{
uniConvertNV12toB_4x4.data[2] = 0x00010000;
uniConvertNV12toB_4x4.data[3] = 0x00230022;
uniConvertNV12toB_4x4.data[8] = 0x40093ca7;
uniConvertNV12toB_4x4.data[10] = 0x40093ca7;
uniConvertNV12toB_4x4.data[12] = 0x40093ca7;
uniConvertNV12toB_4x4.data[14] = 0x40093ca7;

uniConvertNV12toG_4x4.data[2] = 0x01010100;
uniConvertNV12toG_4x4.data[3] = 0x03230322;
uniConvertNV12toG_4x4.data[8] = 0x36413ca7;
uniConvertNV12toG_4x4.data[9] = 0x00003a81;
uniConvertNV12toG_4x4.data[10] = 0x36413ca7;
uniConvertNV12toG_4x4.data[11] = 0x00003a81;
uniConvertNV12toG_4x4.data[12] = 0x36413ca7;
uniConvertNV12toG_4x4.data[13] = 0x00003a81;
uniConvertNV12toG_4x4.data[14] = 0x36413ca7;
uniConvertNV12toG_4x4.data[15] = 0x00003a81;

uniConvertNV12toR_4x4.data[2] = 0x00110010;
uniConvertNV12toR_4x4.data[3] = 0x00330032;
uniConvertNV12toR_4x4.data[8] = 0x3e623ca7;
uniConvertNV12toR_4x4.data[10] = 0x3e623ca7;
uniConvertNV12toR_4x4.data[12] = 0x3e623ca7;
uniConvertNV12toR_4x4.data[14] = 0x3e623ca7;

uniExtractUVtoCharSub128_2x8.data[2] = 0x03020100;
uniExtractUVtoCharSub128_2x8.data[3] = 0x07060504;

uniExtractYtoShortSub16_2x8.data[0] = 0x99999999;
uniExtractYtoShortSub16_2x8.data[1] = 0x44444444;
uniExtractYtoShortSub16_2x8.data[4] = 0xaaaaaaaa;
uniExtractYtoShortSub16_2x8.data[8] = 0x00010001;
uniExtractYtoShortSub16_2x8.data[9] = 0x00010001;
uniExtractYtoShortSub16_2x8.data[10] = 0x00010001;
uniExtractYtoShortSub16_2x8.data[11] = 0x00010001;
uniExtractYtoShortSub16_2x8.data[12] = 0x00010001;
uniExtractYtoShortSub16_2x8.data[13] = 0x00010001;
uniExtractYtoShortSub16_2x8.data[14] = 0x00010001;
uniExtractYtoShortSub16_2x8.data[15] = 0x00010001;
}

status = vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toB_4x4", &uniConvertNV12toB_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toG_4x4", &uniConvertNV12toG_4x4);
@@ -266,12 +343,15 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder);
status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractUVtoCharSub128_2x8", &uniExtractUVtoCharSub128_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractYtoShortSub16_2x8", &uniExtractYtoShortSub16_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_b", &outputScaleVar_b);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_g", &outputScaleVar_g);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_r", &outputScaleVar_r);
status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUchartoFp32_4x4", &uniConvertUchartoFp32_4x4);
CHECK_STATUS_FAIL_GOTO(status, OnError);
switch( attr[0]->dtype )
{
case U8:
@@ -335,6 +415,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer)
float outputScaleVar_b = 0.0f, outputScaleVar_g = 0.0f, outputScaleVar_r = 0.0f;
float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f;
float resize = 0.0f;
vsi_bool ocv_nv12 = _check_nv12_type_from_env();

vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
vsi_size_array_t * out_shape = NULL;
@@ -445,6 +526,17 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer)
0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000,
0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertYtoShortSub16_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };

gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
@@ -487,11 +579,64 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer)
0x00000000, 0x00010000, 0x00000000, 0x00010000,
0x00000000, 0x00010000, 0x00000000, 0x00010000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertUchartoFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000,
0x00030002, // ABin
0x02020202, // BSelt
0x00000000,
0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };

if (ocv_nv12)
{
uniConvertNV12toB_4x4.data[2] = 0x00010000;
uniConvertNV12toB_4x4.data[3] = 0x00230022;
uniConvertNV12toB_4x4.data[8] = 0x40093ca7;
uniConvertNV12toB_4x4.data[10] = 0x40093ca7;
uniConvertNV12toB_4x4.data[12] = 0x40093ca7;
uniConvertNV12toB_4x4.data[14] = 0x40093ca7;

uniConvertNV12toG_4x4.data[2] = 0x01010100;
uniConvertNV12toG_4x4.data[3] = 0x03230322;
uniConvertNV12toG_4x4.data[8] = 0x36413ca7;
uniConvertNV12toG_4x4.data[9] = 0x00003a81;
uniConvertNV12toG_4x4.data[10] = 0x36413ca7;
uniConvertNV12toG_4x4.data[11] = 0x00003a81;
uniConvertNV12toG_4x4.data[12] = 0x36413ca7;
uniConvertNV12toG_4x4.data[13] = 0x00003a81;
uniConvertNV12toG_4x4.data[14] = 0x36413ca7;
uniConvertNV12toG_4x4.data[15] = 0x00003a81;

uniConvertNV12toR_4x4.data[2] = 0x00110010;
uniConvertNV12toR_4x4.data[3] = 0x00330032;
uniConvertNV12toR_4x4.data[8] = 0x3e623ca7;
uniConvertNV12toR_4x4.data[10] = 0x3e623ca7;
uniConvertNV12toR_4x4.data[12] = 0x3e623ca7;
uniConvertNV12toR_4x4.data[14] = 0x3e623ca7;

uniConvertYtoShortSub16_2x8.data[0] = 0x99999999;
uniConvertYtoShortSub16_2x8.data[1] = 0x44444444;
uniConvertYtoShortSub16_2x8.data[4] = 0xaaaaaaaa;
uniConvertYtoShortSub16_2x8.data[8] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[9] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[10] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[11] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[12] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[13] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[14] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[15] = 0x00010001;
}

status = vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toB_4x4", &uniConvertNV12toB_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toG_4x4", &uniConvertNV12toG_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toR_4x4", &uniConvertNV12toR_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUVtoCharSub128_2x8", &uniConvertUVtoCharSub128_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertYtoShortSub16_2x8", &uniConvertYtoShortSub16_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "xrIntFloat_16", &xrIntFloat_16);
status |= vsi_nn_kernel_gpu_add_param(node, "yrIntFloat_16", &yrIntFloat_16);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_b", &outputScaleVar_b);
@@ -506,6 +651,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateYShift_2x8", &uniCalculateYShift_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateUVShift_2x8", &uniCalculateUVShift_2x8);
}
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUchartoFp32_4x4", &uniConvertUchartoFp32_4x4);
CHECK_STATUS_FAIL_GOTO(status, OnError );

status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder);
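On the NV12 preprocessing: uniExtractYtoShortSub16_2x8 and uniExtractUVtoCharSub128_2x8 subtract the video-range offsets (16 from Y, 128 from U/V) before the per-channel dot products, and the fp16 constants above encode the BT.601 coefficients (0x3ca7 = 1.164, 0x3e62 = 1.596, 0x3641 = 0.391, 0x3a81 = 0.813, 0x4009 = 2.018). A scalar C sketch of the conversion those tables implement (sketch only; the OCV patching above swaps in OpenCV-compatible table layouts):

/* Scalar sketch of video-range BT.601 YUV -> RGB, matching the
 * subtract-16 / subtract-128 preprocessing done by the DP tables. */
#include <stdint.h>
static void yuv_to_rgb_bt601( uint8_t y, uint8_t u, uint8_t v,
                              float* r, float* g, float* b )
{
    float yf = 1.164f * ((float)y - 16.0f);
    float uf = (float)u - 128.0f;
    float vf = (float)v - 128.0f;
    *r = yf + 1.596f * vf;
    *g = yf - 0.391f * uf - 0.813f * vf;
    *b = yf + 2.018f * uf;
}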
@@ -249,6 +249,18 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_copy_initializer)
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertUchartoFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000,
0x00030002, // ABin
0x02020202, // BSelt
0x00000000,
0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };

status = vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toB_4x4", &uniConvertYUV422toB_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toG_4x4", &uniConvertYUV422toG_4x4);
@@ -262,6 +274,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_copy_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUchartoFp32_4x4", &uniConvertUchartoFp32_4x4);
CHECK_STATUS_FAIL_GOTO(status, OnError);
switch( attr[0]->dtype )
{
case U8:
@@ -461,6 +475,18 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_initializer)
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertUchartoFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000,
0x00030002, // ABin
0x02020202, // BSelt
0x00000000,
0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };

status = vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toB_4x4", &uniConvertYUV422toB_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toG_4x4", &uniConvertYUV422toG_4x4);
@@ -477,6 +503,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractYtoShortSub16_4x4", &uniExtractYtoShortSub16_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder);
status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUchartoFp32_4x4", &uniConvertUchartoFp32_4x4);
CHECK_STATUS_FAIL_GOTO(status, OnError );

switch( attr[0]->dtype )
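Both YUV422 initializers (and the NV12 ones above) now also upload uniConvertUchartoFp32_4x4, a DP table that widens four unsigned bytes to float32. Its scalar equivalent is trivial; a sketch, assuming packed uchar input:

/* Scalar equivalent of the uchar -> fp32 widening the DP table performs. */
#include <stdint.h>
static void uchar4_to_fp32( const uint8_t src[4], float dst[4] )
{
    for (int i = 0; i < 4; i++)
    {
        dst[i] = (float)src[i];
    }
}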
@@ -664,9 +664,15 @@ static vsi_nn_kernel_node_t _setup
hashkeys[1] = BILINEAR_NHWC_BOUND_HASH_KEY( in_dtype, out_dtype, (vsi_size_t)up_scale );

status = _query_kernel( ikernels[0], hashkeys[0], 0);
CHECK_STATUS_FAIL_GOTO(status, final );
if (status != VSI_SUCCESS)
{
goto final;
}
status = _query_kernel( kernel, hashkeys[1], 1);
CHECK_STATUS_FAIL_GOTO(status, final );
if (status != VSI_SUCCESS)
{
goto final;
}

shapes[0][0] = depth * inputs[0]->attr.size[1];
shapes[0][1] = inputs[0]->attr.size[2];
@@ -532,10 +532,10 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_ref_initializer)
width = (width + 15) / 16;
}

input0_zp = attr[0]->asymm.zero_point;
input0_scale = attr[0]->asymm.scale;
output_zp = attr[1]->asymm.zero_point;
output_scale = 1.0f / attr[1]->asymm.scale;
input0_zp = attr[0]->zero_point;
input0_scale = attr[0]->scale;
output_zp = attr[1]->zero_point;
output_scale = 1.0f / attr[1]->scale;

gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
@@ -670,10 +670,10 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_update_initializer)
update_width = (int32_t)(attr[1]->shape->data[0]);
index_num = (int32_t)(attr[0]->shape->data[1]);

input1_zp = attr[1]->asymm.zero_point;
input1_scale = attr[1]->asymm.scale;
output_zp = attr[2]->asymm.zero_point;
output_scale = 1.0f / attr[2]->asymm.scale;
input1_zp = attr[1]->zero_point;
input1_scale = attr[1]->scale;
output_zp = attr[2]->zero_point;
output_scale = 1.0f / attr[2]->scale;

if (coord_dim == 5)
{
@@ -916,10 +916,10 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_reset_initializer)
}
width = element_size / 8;

input_zp0 = attr[0]->asymm.zero_point;
input_scale0 = attr[0]->asymm.scale;
output_zp = attr[1]->asymm.zero_point;
output_scale = attr[1]->asymm.scale;
input_zp0 = attr[0]->zero_point;
input_scale0 = attr[0]->scale;
output_zp = attr[1]->zero_point;
output_scale = attr[1]->scale;

if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE)
{
@@ -933,9 +933,14 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_reset_initializer)
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;

gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
if (element_size < 8)
{
gpu_param.global_size[0] = element_size;
}
else
{
gpu_param.global_size[0] = width;
}
gpu_param.global_size[1] = 1;
gpu_param.global_size[2] = 1;

@@ -1006,7 +1011,7 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_update_initializer)
int32_t coord_dim = 0;
int32_t strides[VSI_NN_MAX_DIM_NUM] = {0};
int32_t coord_strides[8] = {0};
int32_t *coord_strides1 = coord_strides + 4;
int32_t coord_strides1[4] = {0};
int32_t input2_zp = 0;
int32_t i = 0;

@@ -1046,13 +1051,14 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_update_initializer)
width = block_size / 4;
}

input2_zp = attr[1]->asymm.zero_point;
input2_zp = attr[1]->zero_point;

coord_strides[coord_dim - 1] = 1;
for (i = 0; i < coord_dim - 1; i++)
{
coord_strides[i] = strides[coord_dim - 2 - i];
}
memcpy(coord_strides1, coord_strides + 4, 4 * sizeof(int32_t));

gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
@@ -1165,7 +1171,7 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_ref_initializer)
int32_t coord_dim = 0;
int32_t strides[VSI_NN_MAX_DIM_NUM] = {0};
int32_t coord_strides[8] = {0};
int32_t *coord_strides1 = coord_strides + 4;
int32_t coord_strides1[4] = {0};
float output_zp = 0;
float input_scale = 1.0f;
float output_scale = 1.0f;
@@ -1202,9 +1208,9 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_ref_initializer)
update_width = (int32_t)(attr[1]->shape->data[0]);
index_num = (int32_t)(attr[0]->shape->data[1]);

input_scale = attr[1]->asymm.scale;
output_scale = attr[2]->asymm.scale;
output_zp = (float)attr[2]->asymm.zero_point;
input_scale = attr[1]->scale;
output_scale = attr[2]->scale;
output_zp = (float)attr[2]->zero_point;
if (attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE)
{
input_scale = 1.0f;
@@ -1220,6 +1226,7 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_ref_initializer)
{
coord_strides[i] = strides[coord_dim - 2 - i];
}
memcpy(coord_strides1, coord_strides + 4, 4 * sizeof(int32_t));

width = block_size;
if (block_size % 4 == 0)
@@ -1337,9 +1344,14 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_copy_initializer)
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;

gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
if (element_size < 8)
{
gpu_param.global_size[0] = element_size;
}
else
{
gpu_param.global_size[0] = width;
}
gpu_param.global_size[1] = 1;
gpu_param.global_size[2] = 1;
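The scatter_nd_update initializers now read scale and zero_point from the generic attr fields instead of the asymm-specific ones, so affine and non-affine quantized tensors take the same path. The requantization those values feed is the usual affine mapping; a scalar sketch:

/* Requantize: map an input quantized value to the output quantization.
 * real = (q_in - zp_in) * s_in; q_out ~= real / s_out + zp_out */
#include <stdint.h>
static float requantize( int32_t q_in, float s_in, int32_t zp_in,
                         float s_out, int32_t zp_out )
{
    return ((float)(q_in - zp_in)) * s_in / s_out + (float)zp_out;
}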
@@ -479,6 +479,7 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape
vsi_size_t* temp_shape_y = NULL;
vsi_size_t* temp_shape_output = NULL;
vsi_size_t temp_rank = 0;
vsi_bool exceed_maxsize = FALSE;

#define _swap_size(a, b, tmp) \
{ \
@@ -490,6 +491,27 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape
VSI_UNREFERENCED(rank_x);
VSI_UNREFERENCED(rank);

for (i = 0; i < rank_output; i++)
{
if (shape_output[i] > GPU_TENSOR_MAX_WIDTH)
{
exceed_maxsize = TRUE;
}
}

if (exceed_maxsize)
{
for (i = 0; i < rank_output; i++)
{
out_shape_x[i] = shape_x[i];
out_shape_y[i] = multiples[i];
out_shape_output[i] = shape_output[i];
}
*out_rank_output = rank_output;
ret = TRUE;
goto final;
}

temp_shape_x = (vsi_size_t*)malloc(rank * sizeof(vsi_size_t));
if (temp_shape_x == NULL)
{
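The added guard passes the caller's shapes through unchanged whenever any output extent already exceeds GPU_TENSOR_MAX_WIDTH, presumably because folding adjacent dimensions can only make an axis wider (an inference, not stated in the change). A condensed sketch of the check:

/* Pass-through when an axis already exceeds the GPU image limit. */
vsi_bool exceed_maxsize = FALSE;
for (i = 0; i < rank_output; i++)
{
    if (shape_output[i] > GPU_TENSOR_MAX_WIDTH)
    {
        exceed_maxsize = TRUE;   /* merging dims could only widen an axis */
    }
}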
@@ -156,5 +156,17 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(inverse_sigmoid)
#if (VX_TENSOR_SELECT_VX_SUPPORT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(select)
#endif
#if (VX_TENSOR_POW_API_SUPPORT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(pow)
#endif
#if (VX_TENSOR_GATHER_API_SUPPORT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(gather)
#endif
#if (VX_RELATIONAL_OPS_VX_SUPPORT_EXT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(relational_ops)
#endif
#if (VX_TENSOR_TILE_API_SUPPORT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(tile)
#endif

__END_DECLS
@@ -0,0 +1,82 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"

#if (VX_TENSOR_GATHER_API_SUPPORT)

#define REGISTER_GATHEROPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
); \
REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
)

REGISTER_GATHEROPENVX_KERNEL( gather )
{
vx_node node = NULL;
int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis");
int32_t batch_dims = vsi_nn_kernel_param_get_int32(params, "batch_dims");

VSI_UNREFERENCED(kernel);
VSI_UNREFERENCED(params);
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);

node = vxTensorGatherNode(graph->g,
inputs[0]->t,
inputs[1]->t,
axis,
batch_dims,
outputs[0]->t
);

return (vsi_nn_kernel_node_t)node;
} /* gather() */

#undef REGISTER_GATHEROPENVX_KERNEL

#endif
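gather here, and pow, relational_ops, and tile below, all follow the same shape: a registration macro forward-declares a _setup hook, registers it with REGISTER_BACKEND_OPENVX, and then opens the hook's definition, whose body simply builds the corresponding vx node. Stripped of the macro plumbing, the gather adapter is equivalent to this sketch (same calls as in the file above):

/* What REGISTER_GATHEROPENVX_KERNEL( gather ) expands to, in spirit:
 * a setup hook that hands the tensors straight to OpenVX. */
static vsi_nn_kernel_node_t _gathersetup( vsi_nn_graph_t * graph,
    vsi_nn_tensor_t ** inputs, size_t input_num,
    vsi_nn_tensor_t ** outputs, size_t output_num,
    const vsi_nn_kernel_param_t * params, vsi_nn_kernel_t * kernel )
{
    int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
    int32_t batch_dims = vsi_nn_kernel_param_get_int32( params, "batch_dims" );
    return (vsi_nn_kernel_node_t)vxTensorGatherNode( graph->g,
        inputs[0]->t, inputs[1]->t, axis, batch_dims, outputs[0]->t );
}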
@@ -0,0 +1,73 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"

#if (VX_TENSOR_POW_API_SUPPORT)

#define REGISTER_POWOPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
); \
REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
)

REGISTER_POWOPENVX_KERNEL( pow )
{
vx_node node = vxTensorPowNode( graph->g, inputs[0]->t, inputs[1]->t,
outputs[0]->t );

VSI_UNREFERENCED(kernel);
VSI_UNREFERENCED(params);
VSI_UNREFERENCED(output_num);
VSI_UNREFERENCED(input_num);

return (vsi_nn_kernel_node_t)node;
} /* pow() */

#undef REGISTER_POWOPENVX_KERNEL

#endif
@@ -0,0 +1,83 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"

#if (VX_RELATIONAL_OPS_VX_SUPPORT_EXT)

#define REGISTER_RELATIONAL_OPS_OPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
); \
REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
)

REGISTER_RELATIONAL_OPS_OPENVX_KERNEL( relational_ops )
{
vx_node node = NULL;

int32_t operation = vsi_nn_kernel_param_get_int32(params, "operation");
vx_tensor inputs_tensor[2] = {NULL};

inputs_tensor[0] = inputs[0]->t;
inputs_tensor[1] = inputs[1]->t;

VSI_UNREFERENCED(kernel);
VSI_UNREFERENCED(output_num);

node = vxRelationalLayer(graph->g,
operation,
inputs_tensor,
(uint32_t)input_num,
outputs[0]->t
);

return (vsi_nn_kernel_node_t)node;
} /* relational_ops() */

#undef REGISTER_RELATIONAL_OPS_OPENVX_KERNEL

#endif
@@ -0,0 +1,78 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"

#if (VX_TENSOR_TILE_API_SUPPORT)

#define REGISTER_TILEOPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
); \
REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
)

REGISTER_TILEOPENVX_KERNEL( tile )
{
vx_node node = NULL;

VSI_UNREFERENCED(kernel);
VSI_UNREFERENCED(params);
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);

node = vxTensorTileNode(graph->g,
inputs[0]->t,
inputs[1]->t,
outputs[0]->t
);

return (vsi_nn_kernel_node_t)node;
} /* tile() */

#undef REGISTER_TILEOPENVX_KERNEL

#endif
@@ -88,6 +88,8 @@ __kernel void cumsum_##name##toU8_axis2( \
\
src_type sum = (src_type)(0); \
uint4 dst = (uint4)(0); \
int tmp_zp = convert_int_rte(output_zp); \
dst.x = convert_uint_sat(tmp_zp); \
\
float cnt = 0.0f; \
\
@@ -252,6 +254,8 @@ __kernel void cumsum_##name##toU8_axis1( \
\
src_type sum = (src_type)(0); \
uint4 dst = (uint4)(0); \
int tmp_zp = convert_int_rte(output_zp); \
dst.x = convert_uint_sat(tmp_zp); \
\
float cnt = 0; \
\
@@ -416,6 +420,8 @@ __kernel void cumsum_##name##toU8_axis0( \
\
src_type sum = (src_type)(0); \
uint4 dst = (uint4)(0); \
int tmp_zp = convert_int_rte(output_zp); \
dst.x = convert_uint_sat(tmp_zp); \
\
float cnt = 0; \
\
@@ -487,4 +493,4 @@ __kernel void cumsum_##name##toU8_axis0( \
} \
}
CUMSUM_toU8_AXIS0_SH(U8,uint4,read_imageui)
CUMSUM_toU8_AXIS0_SH(F32,float4,read_imagef)
CUMSUM_toU8_AXIS0_SH(F32,float4,read_imagef)
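The cumsum change initializes the first exclusive-scan output element from the output zero point instead of a literal zero, so a quantized output actually encodes real 0.0. A scalar C sketch of the fixed write, assuming asymmetric U8 output (lrintf stands in for convert_int_rte under the default nearest-even rounding mode):

/* Exclusive cumsum, quantized output: element 0 must hold the encoding
 * of real 0.0, i.e. the zero point, not the raw value 0. */
#include <math.h>
#include <stdint.h>
static uint8_t exclusive_scan_seed( float output_zp )
{
    int tmp_zp = (int)lrintf( output_zp );                 /* as convert_int_rte */
    if (tmp_zp < 0)   tmp_zp = 0;                          /* as convert_uint_sat */
    if (tmp_zp > 255) tmp_zp = 255;
    return (uint8_t)tmp_zp;
}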
@@ -85,12 +85,15 @@ __kernel void cumsum_U8toU8_axis1_2D(
uint4 sum = (uint4)(0);
uint4 dst = (uint4)(0);

int tmp_zp = convert_int_rte(output_zp);
dst.x = convert_uint_sat(tmp_zp);

float cnt = 0;

if(exclusive && rev)
{
coord.w = height - 1;
write_imageui(output, coord.zw, sum);
write_imageui(output, coord.zw, dst);
for(coord.y = height - 1; coord.y > 0; coord.y--)
{
uint4 data = read_imageui(input, coord.xy);
@@ -107,7 +110,7 @@ __kernel void cumsum_U8toU8_axis1_2D(
}
else if(exclusive)
{
write_imageui(output, coord.zw, sum);
write_imageui(output, coord.zw, dst);
for(coord.y = 0; coord.y < height - 1; coord.y++)
{
uint4 data = read_imageui(input, coord.xy);
@@ -173,6 +176,8 @@ __kernel void cumsum_F32toU8_axis1_2D(

float4 sum = (float4)(0);
uint4 dst = (uint4)(0);
int tmp_zp = convert_int_rte(output_zp);
dst.x = convert_uint_sat(tmp_zp);

float cnt = 0;

@@ -331,13 +336,16 @@ __kernel void cumsum_U8toU8_axis0_2D(
uint4 sum = (uint4)(0);
uint4 dst = (uint4)(0);

int tmp_zp = convert_int_rte(output_zp);
dst.x = convert_uint_sat(tmp_zp);

float cnt = 0.0f;

if(exclusive && rev)
{
coord.x = width - 1;
coord.z = coord.x;
write_imageui(output, coord.zw, sum);
write_imageui(output, coord.zw, dst);
for(; coord.x > 0; coord.x--)
{
uint4 data = read_imageui(input, coord.xy);
@@ -355,7 +363,7 @@ __kernel void cumsum_U8toU8_axis0_2D(
else if(exclusive)
{
coord.z = 0;
write_imageui(output, coord.zw, sum);
write_imageui(output, coord.zw, dst);
for(coord.x = 0; coord.x < width - 1; coord.x++)
{
uint4 data = read_imageui(input, coord.xy);
@@ -421,9 +429,10 @@ __kernel void cumsum_F32toU8_axis0_2D(

float4 sum = (float4)(0);
uint4 dst = (uint4)(0);
int tmp_zp = convert_int_rte(output_zp);
dst.x = convert_uint_sat(tmp_zp);

float cnt = 0.0f;

if(exclusive && rev)
{
coord.x = width - 1;
@@ -491,4 +500,4 @@ __kernel void cumsum_F32toU8_axis0_2D(
write_imageui(output, coord.xy, dst);
}
}
}
}
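
For orientation, the control flow these 2D kernels share: the exclusive variants first write the seed element (index 0, or the far edge when rev is set) and then scan the remaining positions, which is why the seed write is the line the commit changes from `sum` to `dst`. A plain-C sketch of the exclusive, reversed case along one column (loop shape only, not the quantized arithmetic):

void cumsum_excl_rev(const float *in, float *out, int h)
{
    float sum = 0.0f;
    out[h - 1] = 0.0f;               /* seed; the kernels store the zero-point code here */
    for (int y = h - 1; y > 0; y--)
    {
        sum += in[y];                /* accumulate the elements after position y-1 */
        out[y - 1] = sum;
    }
}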
@@ -1,3 +1,6 @@
#pragma OPENCL EXTENSION cl_viv_vx_extension : enable
#include "cl_viv_vx_ext.h"

#define rlogE (0.693147182f)
float LOG(float x)
{
@@ -5,16 +8,11 @@ float LOG(float x)
return x * rlogE;
}

__kernel void log_softmax_axis0_F32toF32
(
__kernel void log_softmax_axis0_F32toF32(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis,
float beta,
float scale,
float scaleOut,
float zpOut
)
int axis, float beta,
float scale, float scaleOut, float zpOut)
{
int x = get_global_id(0);
int y = get_global_id(1);
@@ -58,16 +56,11 @@ __kernel void log_softmax_axis0_F32toF32
}
}

__kernel void log_softmax_axis0_F32toF32_2D
(
__kernel void log_softmax_axis0_F32toF32_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
float beta,
float scale,
float scaleOut,
float zpOut
)
int axis, float beta,
float scale, float scaleOut, float zpOut)
{
int x = get_global_id(0);
int y = get_global_id(1);
@@ -110,16 +103,11 @@ __kernel void log_softmax_axis0_F32toF32_2D
}
}

__kernel void log_softmax_axis0_U8toU8
(
__kernel void log_softmax_axis0_U8toU8(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis,
float beta,
float scale,
float scaleOut,
float zpOut
)
int axis, float beta,
float scale, float scaleOut, float zpOut)
{
int x = get_global_id(0);
int y = get_global_id(1);
@@ -165,16 +153,11 @@ __kernel void log_softmax_axis0_U8toU8
}
}

__kernel void log_softmax_axis0_U8toU8_2D
(
__kernel void log_softmax_axis0_U8toU8_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
float beta,
float scale,
float scaleOut,
float zpOut
)
int axis, float beta,
float scale, float scaleOut, float zpOut)
{
int x = get_global_id(0);
int y = get_global_id(1);
@@ -217,4 +200,109 @@ __kernel void log_softmax_axis0_U8toU8_2D
coord_in.x++;
}
}

__kernel void log_softmax_axis0_BF16toBF16(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis, float beta,
float scale, float scaleOut, float zpOut)
{
int x = get_global_id(0);
int y = get_global_id(1);
int z = get_global_id(2);
int width = get_image_width(input);
int4 coord_in = (int4)(0, y, z, 0);
float4 maxValue, src, dst = {0.0};
uint4 data, val, out;

data = read_imageui(input, coord_in);
data = data << 16;
_viv_asm(COPY, maxValue, data, 16);
for (coord_in.x = 1; coord_in.x < width; )
{
data = read_imageui(input, coord_in);
coord_in.x++;
data = data << 16;
_viv_asm(COPY, src, data, 16);

maxValue = maxValue > src ? maxValue : src;
}

float sum = 0.f;
for (coord_in.x = 0; coord_in.x < width; )
{
data = read_imageui(input, coord_in);
coord_in.x++;
data = data << 16;
_viv_asm(COPY, src, data, 16);

sum += exp2((src.x - maxValue.x) * scale);
}

float logSum = LOG(sum);
for (coord_in.x = 0; coord_in.x < width; )
{
data = read_imageui(input, coord_in);
data = data << 16;
_viv_asm(COPY, src, data, 16);

dst.x = (src.x - maxValue.x) * beta - logSum;
_viv_asm(COPY, val, dst, 16);
out = val >> 16;
write_imageui(output, coord_in, out);
coord_in.x++;
}
}

__kernel void log_softmax_axis0_BF16toBF16_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis, float beta,
float scale, float scaleOut, float zpOut)
{
int x = get_global_id(0);
int y = get_global_id(1);
int width = get_image_width(input);
int2 coord_in = (int2)(0, y);
float4 maxValue, src, dst = {0.0};
uint4 data, val, out;

data = read_imageui(input, coord_in);
data = data << 16;
_viv_asm(COPY, maxValue, data, 16);
for (coord_in.x = 1; coord_in.x < width; )
{
data = read_imageui(input, coord_in);
coord_in.x++;
data = data << 16;
_viv_asm(COPY, src, data, 16);

maxValue = maxValue > src ? maxValue : src;
}

float sum = 0.0f;
for (coord_in.x = 0; coord_in.x < width; )
{
data = read_imageui(input, coord_in);
coord_in.x++;
data = data << 16;
_viv_asm(COPY, src, data, 16);

sum += exp2((src.x - maxValue.x) * scale);
}

float logSum = LOG(sum);
for (coord_in.x = 0; coord_in.x < width; )
{
data = read_imageui(input, coord_in);
data = data << 16;
_viv_asm(COPY, src, data, 16);

dst.x = (src.x - maxValue.x) * beta - logSum;
_viv_asm(COPY, val, dst, 16);
out = val >> 16;
write_imageui(output, coord_in, out);
coord_in.x++;
}
}
#undef rlogE
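
The BF16 paths added above lean on one bit-level fact: bfloat16 is the upper half of an IEEE-754 float32. Widening is therefore `data << 16` followed by a raw bit copy (`_viv_asm(COPY, ...)`), and narrowing is `val >> 16`, which drops the low mantissa bits (truncation toward zero). The same trick in plain C, for reference:

#include <stdint.h>
#include <string.h>

/* Widen: shift the 16 bf16 bits into the high half, then reinterpret as float
 * (the memcpy plays the role of _viv_asm(COPY, ...)). */
static float bf16_to_f32(uint16_t b)
{
    uint32_t bits = (uint32_t)b << 16;   /* same as `data = data << 16` */
    float f;
    memcpy(&f, &bits, sizeof f);
    return f;
}

/* Narrow: keep only the high 16 bits of the float32 pattern. */
static uint16_t f32_to_bf16(float f)
{
    uint32_t bits;
    memcpy(&bits, &f, sizeof bits);
    return (uint16_t)(bits >> 16);       /* same as `out = val >> 16` */
}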
@@ -1,3 +1,6 @@
#pragma OPENCL EXTENSION cl_viv_vx_extension : enable
#include "cl_viv_vx_ext.h"

#define rlogE (0.693147182f)

float LOG(float x)
@@ -6,16 +9,11 @@ float LOG(float x)
return x * rlogE;
}

__kernel void log_softmax_axis1_F32toF32
(
__kernel void log_softmax_axis1_F32toF32(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis,
float beta,
float scale,
float scaleOut,
float zpOut
)
int axis, float beta,
float scale, float scaleOut, float zpOut)
{
int x = get_global_id(0);
int y = get_global_id(1);
@@ -59,16 +57,11 @@ __kernel void log_softmax_axis1_F32toF32
}
}

__kernel void log_softmax_axis1_F32toF32_2D
(
__kernel void log_softmax_axis1_F32toF32_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
float beta,
float scale,
float scaleOut,
float zpOut
)
int axis, float beta,
float scale, float scaleOut, float zpOut)
{
int x = get_global_id(0);
int y = get_global_id(1);
@@ -111,16 +104,11 @@ __kernel void log_softmax_axis1_F32toF32_2D
}
}

__kernel void log_softmax_axis1_U8toU8
(
__kernel void log_softmax_axis1_U8toU8(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis,
float beta,
float scale,
float scaleOut,
float zpOut
)
int axis, float beta,
float scale, float scaleOut, float zpOut)
{
int x = get_global_id(0);
int y = get_global_id(1);
@@ -166,16 +154,11 @@ __kernel void log_softmax_axis1_U8toU8
}
}

__kernel void log_softmax_axis1_U8toU8_2D
(
__kernel void log_softmax_axis1_U8toU8_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
float beta,
float scale,
float scaleOut,
float zpOut
)
int axis, float beta,
float scale, float scaleOut, float zpOut)
{
int x = get_global_id(0);
int y = get_global_id(1);
@@ -218,4 +201,111 @@ __kernel void log_softmax_axis1_U8toU8_2D
coord_in.y++;
}
}

__kernel void log_softmax_axis1_BF16toBF16(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis, float beta,
float scale, float scaleOut, float zpOut)
{
int x = get_global_id(0);
int y = get_global_id(1);
int z = get_global_id(2);
int height = get_image_height(input);
int4 coord_in = (int4)(x, 0, z, 0);
float4 maxValue, src, dst = {0.0};
uint4 data, val, out;

data = read_imageui(input, coord_in);
data = data << 16;
_viv_asm(COPY, maxValue, data, 16);
for (coord_in.y = 1; coord_in.y < height; )
{
data = read_imageui(input, coord_in);
coord_in.y++;
data = data << 16;
_viv_asm(COPY, src, data, 16);

maxValue = maxValue > src ? maxValue : src;
}

float sum = 0.f;
for (coord_in.y = 0; coord_in.y < height; )
{
data = read_imageui(input, coord_in);
coord_in.y++;
data = data << 16;
_viv_asm(COPY, src, data, 16);

sum += exp2((src.x - maxValue.x) * scale);
}

float logSum = LOG(sum);
for (coord_in.y = 0; coord_in.y < height; )
{
data = read_imageui(input, coord_in);
data = data << 16;
_viv_asm(COPY, src, data, 16);

dst.x = (src.x - maxValue.x) * beta - logSum;

_viv_asm(COPY, val, dst, 16);
out = val >> 16;

write_imageui(output, coord_in, out);
coord_in.y++;
}
}

__kernel void log_softmax_axis1_BF16toBF16_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis, float beta,
float scale, float scaleOut, float zpOut)
{
int x = get_global_id(0);
int y = get_global_id(1);
int height = get_image_height(input);
int2 coord_in = (int2)(x, 0);
float4 maxValue, src, dst = {0.0};
uint4 data, val, out;

data = read_imageui(input, coord_in);
data = data << 16;
_viv_asm(COPY, maxValue, data, 16);
for (coord_in.y = 1; coord_in.y < height; )
{
data = read_imageui(input, coord_in);
coord_in.y++;
data = data << 16;
_viv_asm(COPY, src, data, 16);

maxValue = maxValue > src ? maxValue : src;
}

float sum = 0.0f;
for (coord_in.y = 0; coord_in.y < height; )
{
data = read_imageui(input, coord_in);
coord_in.y++;
data = data << 16;
_viv_asm(COPY, src, data, 16);

sum += exp2((src.x - maxValue.x) * scale);
}

float logSum = LOG(sum);
for (coord_in.y = 0; coord_in.y < height; )
{
data = read_imageui(input, coord_in);
data = data << 16;
_viv_asm(COPY, src, data, 16);

dst.x = (src.x - maxValue.x) * beta - logSum;
_viv_asm(COPY, val, dst, 16);
out = val >> 16;
write_imageui(output, coord_in, out);
coord_in.y++;
}
}
#undef rlogE
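
For reference, all of these kernels evaluate the standard numerically stable log-softmax. Reading the code, `scale` is evidently beta times log2(e) so that exp2 realizes the natural exponential, and rlogE = ln 2 converts LOG's log2 back to a natural logarithm (that reading of `scale` is inferred from the code, not stated in the diff):

\[
\mathrm{log\_softmax}(x_i) \;=\; (x_i - m)\,\beta \;-\; \ln \sum_j e^{(x_j - m)\beta},
\qquad m = \max_j x_j,
\]
\[
e^{(x-m)\beta} \;=\; 2^{\,(x-m)\,\beta \log_2 e}, \qquad \ln s \;=\; \ln 2 \cdot \log_2 s .
\]

Subtracting the running maximum m keeps every exponent non-positive, so the sum cannot overflow regardless of the input range.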
@@ -1,3 +1,6 @@
#pragma OPENCL EXTENSION cl_viv_vx_extension : enable
#include "cl_viv_vx_ext.h"

#define rlogE (0.693147182f)
float LOG(float x)
{
@@ -112,4 +115,68 @@ __kernel void log_softmax_axis2_U8toU8
coord_in.z++;
}
}

__kernel void log_softmax_axis2_BF16toBF16
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis,
float beta,
float scale,
float scaleOut,
float zpOut
)
{
int x = get_global_id(0);
int y = get_global_id(1);
int z = get_global_id(2);
int depth = get_image_array_size(input);
int4 coord_in = (int4)(x, y, 0, 0);
float4 maxValue;
float4 src, dst = {0.0};
uint4 data, val, out;

// Find max element value which we'll use to ensure numerical stability
// taking advantage of the following equality:
// exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
data = read_imageui(input, coord_in);
data = data << 16;
_viv_asm(COPY, maxValue, data, 16);
for (coord_in.z = 1; coord_in.z < depth; )
{
data = read_imageui(input, coord_in);
coord_in.z++;
data = data << 16;
_viv_asm(COPY, src, data, 16);

maxValue = maxValue > src ? maxValue : src;
}

// Compute sum.
float sum = 0.f;
for (coord_in.z = 0; coord_in.z < depth; )
{
data = read_imageui(input, coord_in);
coord_in.z++;
data = data << 16;
_viv_asm(COPY, src, data, 16);

sum += exp2((src.x - maxValue.x) * scale);
}

// Compute result.
float logSum = LOG(sum);
for (coord_in.z = 0; coord_in.z < depth; )
{
data = read_imageui(input, coord_in);
data = data << 16;
_viv_asm(COPY, src, data, 16);

dst.x = (src.x - maxValue.x) * beta - logSum;
_viv_asm(COPY, val, dst, 16);
out = val >> 16;
write_imageui(output, coord_in, out);
coord_in.z++;
}
}
#undef rlogE

@@ -0,0 +1,127 @@
#pragma OPENCL EXTENSION CL_VIV_asm : enable

__kernel void gemm_4x_F32F32toF32_2D(
__read_only image2d_t inputA,
__read_only image2d_t inputB,
__write_only image2d_t output,
int M,
int K,
int N,
int ac2zero,
int bc2zero,
float scale_a,
float zp_a,
float scale_b,
float zp_b,
float scale_out,
float zp_out
)
{
int offset0 = get_global_id(0) * K;
int offset1 = offset0 + K;
int offset2 = offset1 + K;
int offset3 = offset2 + K;
int out_offset = get_global_id(0);
int z = 0;
float4 sum = (float4)(0, 0, 0, 0);

Image in0_tensor = create_image_from_image2d(inputA, 4);
__global float* in0_ptr = (__global float*)in0_tensor.ptr;
__global float* in0_ptr0 = in0_ptr + offset0;
__global float* in0_ptr1 = in0_ptr + offset1;
__global float* in0_ptr2 = in0_ptr + offset2;
__global float* in0_ptr3 = in0_ptr + offset3;

Image in1_tensor = create_image_from_image2d(inputB, 4);
__global float* in1_ptr = (__global float*)in1_tensor.ptr;

Image o_tensor = create_image_from_image2d(output, 4);
__global float* output_ptr = (__global float*)o_tensor.ptr + out_offset;

int step = K >> 2;
for(z = 0; z < step; z++)
{
float4 tempA0, tempA1, tempA2, tempA3;
float4 tempB0;

tempB0 = vload4(z, in1_ptr);
tempA0 = vload4(z, in0_ptr0);
tempA1 = vload4(z, in0_ptr1);
tempA2 = vload4(z, in0_ptr2);
tempA3 = vload4(z, in0_ptr3);

sum.x += dot(tempA0, tempB0);
sum.y += dot(tempA1, tempB0);
sum.z += dot(tempA2, tempB0);
sum.w += dot(tempA3, tempB0);
}

vstore4(sum, 0, output_ptr);

}
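
What this kernel computes, in the notation of its arguments: each invocation produces four adjacent outputs by dotting four consecutive K-length rows of A against the same vector from B, accumulating four elements per trip,

\[ \mathrm{out}_{i+r} \;=\; \sum_{k=0}^{K-1} A_{i+r,\,k}\, B_{k}, \qquad r = 0..3 . \]

Since `step = K >> 2` discards any remainder, K is presumably required to be a multiple of 4 by the host-side setup, and the launch presumably strides get_global_id(0) so the 4-row tiles do not overlap; neither constraint is visible in this hunk. The quantization arguments (scale_a, zp_a, scale_b, zp_b, scale_out, zp_out) are accepted but unused in this F32 path.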

__kernel void gemm_4x_transa_F32F32toF32_2D(
__read_only image2d_t inputA,
__read_only image2d_t inputB,
__write_only image2d_t output,
int M,
int K,
int N,
int ac2zero,
int bc2zero,
float scale_a,
float zp_a,
float scale_b,
float zp_b,
float scale_out,
float zp_out
)
{
int offset0 = get_global_id(0);
int offset1 = M << 2;

int z = 0;
float4 sum = (float4)(0, 0, 0, 0);

Image in0_tensor = create_image_from_image2d(inputA, 4);
__global float* in0_ptr0 = (__global float*)in0_tensor.ptr + offset0;
__global float* in0_ptr1 = in0_ptr0 + M;
__global float* in0_ptr2 = in0_ptr1 + M;
__global float* in0_ptr3 = in0_ptr2 + M;

Image in1_tensor = create_image_from_image2d(inputB, 4);
__global float* in1_ptr = (__global float*)in1_tensor.ptr;

Image o_tensor = create_image_from_image2d(output, 4);
__global float* output_ptr = (__global float*)o_tensor.ptr + offset0;

int step = K >> 2;
for(z = 0; z < step; z++)
{
float4 tempA0, tempA1, tempA2, tempA3;
float4 tempB0;

tempB0 = vload4(z, in1_ptr);
tempA0 = vload4(0, in0_ptr0);
tempA1 = vload4(0, in0_ptr1);
tempA2 = vload4(0, in0_ptr2);
tempA3 = vload4(0, in0_ptr3);

sum += tempA0 * tempB0.x;
sum += tempA1 * tempB0.y;
sum += tempA2 * tempB0.z;
sum += tempA3 * tempB0.w;

in0_ptr0 = in0_ptr0 + offset1;
in0_ptr1 = in0_ptr1 + offset1;
in0_ptr2 = in0_ptr2 + offset1;
in0_ptr3 = in0_ptr3 + offset1;

}

vstore4(sum, 0, output_ptr);

}
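
The transa variant reads the same operand stored transposed (K rows of M floats): each trip loads one float4 of B plus one float4 from each of four consecutive K-rows at column j = get_global_id(0), accumulating

\[ \mathrm{out}_{j+c} \;\mathrel{+}=\; \sum_{r=0}^{3} A^{T}_{k+r,\; j+c}\, B_{k+r}, \qquad c = 0..3, \]

and then advances all four row pointers by offset1 = M << 2, i.e. four rows of M floats. The per-element multiply-adds (tempA0 * tempB0.x and so on) replace the dot products of the non-transposed kernel because here the four loaded A values belong to four different outputs rather than four different k.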
@@ -0,0 +1,217 @@
#define VSI_FLOAT32_MIN (1.175494351e-38F)

#define MAXPOOL_QINT(in_name, out_name, src_type, dst_type, max_val, read_func, write_func, conv_func) \
__kernel void maxpool_##in_name##to##out_name( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int width, \
int height, \
int stride_x, \
int stride_y, \
int pad_x, \
int pad_y, \
int kernel_dia_x, \
int kernel_dia_y, \
int dilation_x, \
int dilation_y, \
float inout_scale, \
float inout_tail) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
int gidz = get_global_id(2); \
int4 coord_out = (int4)(gidx, gidy, gidz, 0); \
int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y); \
int4 coord_in = coord_out; \
int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y); \
\
for(; pos_start.x < 0;) \
{ \
pos_start.x += dilation_x; \
} \
for(; pos_start.y < 0;) \
{ \
pos_start.y += dilation_y; \
} \
\
pos_end = min(pos_end, (int2)(width, height)); \
\
src_type src0, maxVal; \
maxVal.x = max_val; \
\
for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y) \
{ \
for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;) \
{ \
src0 = read_func(input, coord_in); \
coord_in.x += dilation_x; \
maxVal = max(src0, maxVal); \
} \
} \
\
float4 fValTmp; \
fValTmp.x = maxVal.x * inout_scale + inout_tail; \
dst_type dst = conv_func(fValTmp); \
write_func(output, coord_out, dst.xxxx); \
}
MAXPOOL_QINT(U32, U32, uint4, uint4, 0, read_imageui, write_imageui, convert_uint4_rte)
MAXPOOL_QINT(I32, I32, int4, int4, -2147483648, read_imagei, write_imagei, convert_int4_rte)
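
A quick note on the window arithmetic. The naming reads as if kernel_dia_* is the dilated kernel extent, i.e. (k - 1) * dilation + 1; that reading is an assumption, not stated in the hunk. The window start is out * stride - pad, and stepping it up by whole dilations keeps every sampled tap on the dilated grid while skipping the left/top padding. A plain-C sketch of the tap enumeration along one axis:

/* Sketch: taps visited along one axis for output index ox (assumes
 * kernel_dia = (k - 1) * dilation + 1, the dilated kernel extent). */
void maxpool_taps(int ox, int stride, int pad, int kernel_dia, int dilation, int width)
{
    int start = ox * stride - pad;
    int end   = start + kernel_dia;
    while (start < 0)
        start += dilation;        /* skip padding, stay on the dilated grid */
    if (end > width)
        end = width;              /* clip the right edge */
    for (int x = start; x < end; x += dilation)
        ;                         /* read input at x, fold into the running max */
}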

__kernel void maxpool_F32toF32(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int width,
int height,
int stride_x,
int stride_y,
int pad_x,
int pad_y,
int kernel_dia_x,
int kernel_dia_y,
int dilation_x,
int dilation_y,
float inout_scale,
float inout_tail)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int gidz = get_global_id(2);
int4 coord_out = (int4)(gidx, gidy, gidz, 0);
int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y);
int4 coord_in = coord_out;
int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y);

for(; pos_start.x < 0;)
{
pos_start.x += dilation_x;
}
for(; pos_start.y < 0;)
{
pos_start.y += dilation_y;
}

pos_end = min(pos_end, (int2)(width, height));

float4 src0, maxVal;
maxVal.x = VSI_FLOAT32_MIN;

for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y)
{
for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;)
{
src0 = read_imagef(input, coord_in);
coord_in.x += dilation_x;
maxVal = max(src0, maxVal);
}
}

write_imagef(output, coord_out, maxVal.xxxx);
}

__kernel void maxpool_U32toF32(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int width,
int height,
int stride_x,
int stride_y,
int pad_x,
int pad_y,
int kernel_dia_x,
int kernel_dia_y,
int dilation_x,
int dilation_y,
float inout_scale,
float inout_tail)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int gidz = get_global_id(2);
int4 coord_out = (int4)(gidx, gidy, gidz, 0);
int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y);
int4 coord_in = coord_out;
int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y);

for(; pos_start.x < 0;)
{
pos_start.x += dilation_x;
}
for(; pos_start.y < 0;)
{
pos_start.y += dilation_y;
}

pos_end = min(pos_end, (int2)(width, height));

uint4 src0, maxVal;
maxVal.x = 0;

for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y)
{
for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;)
{
src0 = read_imageui(input, coord_in);
coord_in.x += dilation_x;
maxVal = max(src0, maxVal);
}
}

float4 dst;
dst.x = maxVal.x * inout_scale + inout_tail;

write_imagef(output, coord_out, dst.xxxx);
}

__kernel void maxpool_F32toU32(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int width,
int height,
int stride_x,
int stride_y,
int pad_x,
int pad_y,
int kernel_dia_x,
int kernel_dia_y,
int dilation_x,
int dilation_y,
float inout_scale,
float inout_tail)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int gidz = get_global_id(2);
int4 coord_out = (int4)(gidx, gidy, gidz, 0);
int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y);
int4 coord_in = coord_out;
int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y);

for(; pos_start.x < 0;)
{
pos_start.x += dilation_x;
}
for(; pos_start.y < 0;)
{
pos_start.y += dilation_y;
}

pos_end = min(pos_end, (int2)(width, height));

float4 src0, maxVal;
maxVal.x = VSI_FLOAT32_MIN;

for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y)
{
for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;)
{
src0 = read_imagef(input, coord_in);
coord_in.x += dilation_x;
maxVal = max(src0, maxVal);
}
}

uint4 dst;
dst.x = convert_uint_rte(maxVal.x * inout_scale + inout_tail);

write_imageui(output, coord_out, dst.xxxx);
}
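
One caveat worth flagging on the F32 paths (an observation about the code, not a change made in this commit): VSI_FLOAT32_MIN is the smallest positive normal float, i.e. FLT_MIN from <float.h> (about 1.175e-38), so seeding the running maximum with it assumes the pooled inputs are non-negative. A max over data that can go negative would conventionally be seeded with -FLT_MAX instead.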
@@ -232,3 +232,66 @@ __kernel void moments_axis01_BF16toF32(
write_imagef(output_vari, coord_out, vari);
}
}

__kernel __attribute__((reqd_work_group_size(8, 8, 1))) void moments_axis12_U8toF32(
image2d_array_t input, image2d_array_t output_mean, image2d_array_t output_vari,
int axis, int axis_num, int input_zp, float input_scale,
int width, int height, int chn, float dimRatio
)
{
int lidx = get_local_id(0);
int lidy = get_local_id(1);
int gidz = get_global_id(2); // width

int4 coord = (int4)(gidz, lidx, lidy, 0);
uint4 data;
float sum = 0, sqr = 0;
float e2InScale = input_scale * input_scale;

__local uint lcl_sumSqr[128];
__local uint lcl_sumSqr1[32];

uint2 tmpSumSqr = 0;
for(coord.z = lidy; coord.z < chn; coord.z += 8)
{
for(coord.y = lidx; coord.y < height;)
{
data = read_imageui(input, coord);
coord.y += 8;
tmpSumSqr = tmpSumSqr + (uint2)(data.x, data.x * data.x);
}
//sqr += (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale;
//sum += (tmpSum - height * input_zp) * input_scale;
}
int index = lidx + lidy * 8;
vstore2(tmpSumSqr, index, lcl_sumSqr);
barrier(CLK_LOCAL_MEM_FENCE);
if(index < 16)
{
uint4 val0 = vload4(index, lcl_sumSqr);
uint4 val1 = vload4(index, lcl_sumSqr + 64);
val0 += val1;
uint2 val2 = val0.xy + val0.zw;
vstore2(val2, index, lcl_sumSqr1);
}
barrier(CLK_LOCAL_MEM_FENCE);
if(index == 0)
{
uint4 val0 = 0;
for(int i = 0; i < 8; i++)
{
val0 += vload4(i, lcl_sumSqr1);
}

float2 tmpVal = convert_float2(val0.xy + val0.zw);
sum = (tmpVal.x - height * chn * input_zp) * input_scale;
sqr = (tmpVal.y - 2 * input_zp * tmpVal.x + height * chn * input_zp * input_zp) * e2InScale;
float4 mean, vari;
mean.x = sum * dimRatio;
vari.x = sqr * dimRatio;
vari.x = vari.x - mean.x * mean.x;

write_imagef(output_mean, coord.xwww, mean);
write_imagef(output_vari, coord.xwww, vari);
}
}
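
The reduction stays in integer (sum q, sum q^2) pairs until the very end and dequantizes once, using (with N = height * chn, scale s, zero point z):

\[ \sum_i s(q_i - z) = s\Big(\sum_i q_i - N z\Big), \qquad
\sum_i \big(s(q_i - z)\big)^2 = s^2\Big(\sum_i q_i^2 - 2 z \sum_i q_i + N z^2\Big), \]

after which mean = dimRatio * (sum x) and vari = dimRatio * (sum x^2) - mean^2, the usual E[x^2] - E[x]^2 form. Structurally, the 8x8 work-group stores 64 interleaved partial pairs in lcl_sumSqr, the first barrier-fenced stage folds them to 16 pairs in lcl_sumSqr1, and work-item 0 folds those to the final pair. The commented-out per-iteration dequantization left in the loop shows what the local-memory accumulation hoists out.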
@@ -5,157 +5,6 @@
_viv_uniform float4 matrix0;
_viv_uniform float2 matrix1;
_viv_uniform float4 matrix4;
__kernel void custom_warp_affine_nearest_neighbor_U8toU8_2D
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
float _m0,
float _m1,
float _m2,
float _m3,
float _m4,
float _m5
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));

float4 coord_f = convert_float4(coord_in);

coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;

coord_in = convert_int4(coord_f);

vxc_uchar16 dst;
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));


VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

__kernel void custom_warp_affine_bilinear_U8toU8_2D
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
float _m0,
float _m1,
float _m2,
float _m3,
float _m4,
float _m5
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));

float4 coord_f = convert_float4(coord_in);

coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;

coord_in = convert_int4(coord_f);

vxc_uchar16 src0, src1, dst;
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#endif

VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#endif

coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#endif

VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#endif

coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#endif

VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#endif

coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
#endif

VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
#endif

VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

__kernel void custom_warp_affine_nearest_neighbor_U8toU8
(

@@ -0,0 +1,158 @@
#pragma OPENCL EXTENSION cl_viv_vx_extension : enable

#include "cl_viv_vx_ext.h"

_viv_uniform float4 matrix0;
_viv_uniform float2 matrix1;
_viv_uniform float4 matrix4;
__kernel void custom_warp_affine_nearest_neighbor_U8toU8_2D
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
float _m0,
float _m1,
float _m2,
float _m3,
float _m4,
float _m5
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));

float4 coord_f = convert_float4(coord_in);

coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;

coord_in = convert_int4(coord_f);

vxc_uchar16 dst;
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));


VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

__kernel void custom_warp_affine_bilinear_U8toU8_2D
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
float _m0,
float _m1,
float _m2,
float _m3,
float _m4,
float _m5
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));

float4 coord_f = convert_float4(coord_in);

coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;

coord_in = convert_int4(coord_f);

vxc_uchar16 src0, src1, dst;
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#endif

VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#endif

coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#endif

VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#endif

coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#endif

VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#endif

coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
#endif

VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
#endif

VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
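
Both kernels apply the 2x3 affine map and batch eight consecutive output pixels per work-item. Assuming the uniforms are packed the way the optional-input variant below loads them, matrix0 = (m0, m1, m2, m3) and matrix1 = (m4, m5), and the source coordinate for output pixel (x, y) is

\[ (x', y') \;=\; (m_0 x + m_2 y + m_4,\;\; m_1 x + m_3 y + m_5). \]

matrix4 then packs the per-column increment (m0, m1) together with its double, so `coord_f = coord_f.zwzw + matrix4` steps the already-transformed coordinate pair two output columns at a time without redoing the full multiply. The bilinear path reads a 2x2 neighborhood and blends with the fractional parts of (x', y'), either in one VXC_BiLinear instruction or with two VXC_Lerp steps on older VX versions.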
@@ -0,0 +1,341 @@
#pragma OPENCL EXTENSION cl_viv_vx_extension : enable
#include "cl_viv_vx_ext.h"

#define WARP_AFFINE(name) \
__kernel void custom_warp_affine_##name \
( \
__read_only image2d_array_t input, \
__read_only image2d_t matrix, \
__write_only image2d_array_t output, \
float _m0, \
float _m1, \
float _m2, \
float _m3, \
float _m4, \
float _m5 \
) \

#define GET_MATRIX_VALUE \
float4 matrix0; \
float2 matrix1; \
float4 matrix4; \
int2 coord_matrix = (int2)(0,0); \
Image img1 = create_image_from_image2d(matrix, 4); \
__global float* matrix_ptr = (__global float*)img1.ptr; \
matrix0 = vload4(0, matrix_ptr); \
matrix1 = vload2(2, matrix_ptr); \
matrix4.x = matrix0.x; \
matrix4.y = matrix0.y; \
matrix4.z = matrix0.x * 2; \
matrix4.w = matrix0.y * 2; \
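
Unlike the uniform-based file above, this variant receives the affine matrix as a second image argument: create_image_from_image2d wraps it so the six coefficients can be read back with vload4/vload2, and matrix4 is then precomputed as (m0, m1, 2*m0, 2*m1) for the two-column stepping. Note the vloadn indexing convention here: vloadn(offset, p) reads p[offset*n] through p[offset*n + n - 1], so vload2(2, matrix_ptr) fetches floats 4 and 5, i.e. (m4, m5). The practical effect is that the matrix becomes an ordinary graph input that can change per execution instead of being baked into the program as _viv_uniform state.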

WARP_AFFINE(nearest_neighbor_U8toU8_2D_optional_input)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
float4 coord_f = convert_float4(coord_in);

GET_MATRIX_VALUE

coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;

coord_in = convert_int4(coord_f);

vxc_uchar16 dst;
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));


VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

WARP_AFFINE(bilinear_U8toU8_2D_optional_input)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));

float4 coord_f = convert_float4(coord_in);

GET_MATRIX_VALUE

coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;

coord_in = convert_int4(coord_f);

vxc_uchar16 src0, src1, dst;
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#endif

VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#endif

coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#endif

VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#endif

coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#endif

VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#endif

coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
#endif

VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
#endif

VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

WARP_AFFINE(nearest_neighbor_U8toU8_optional_input)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
float4 coord_f = convert_float4(coord_in);

GET_MATRIX_VALUE

coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;
coord_in = convert_int4(coord_f);

int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_input.w, baseAddr);

vxc_uchar16 dst;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_input.xy = coord_in.zw;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
coord_input.xy = coord_in.xy;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_input.xy = coord_in.zw;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
coord_input.xy = coord_in.xy;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
coord_input.xy = coord_in.zw;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
coord_input.xy = coord_in.xy;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
coord_input.xy = coord_in.zw;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));


VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
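
The non-2D variants cannot use VXC_ReadImage's 2D coordinates directly; instead they fetch the image descriptor into an int8 with _viv_asm(COPY, ...) and compute a per-slice base address before issuing img_load_3d with the packed coordinate. Reading the code, s0 looks like the base address and s4 like the slice pitch; those field meanings are inferred from usage, not documented in this diff. In pointer terms the computation is just:

/* Sketch: byte address of slice z, then 2D loads relative to it
 * (field meanings assumed from usage). */
int baseAddr = z * slice_pitch /* input_desc.s4 */ + base /* input_desc.s0 */;

with coord_input.w carrying baseAddr into each VXC_OP4(img_load_3d, ...) load.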

WARP_AFFINE(bilinear_U8toU8_optional_input)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
float4 coord_f = convert_float4(coord_in);

GET_MATRIX_VALUE

coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;

coord_in = convert_int4(coord_f);

int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_input.w, baseAddr);

vxc_uchar16 src0, src1, dst;
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#endif

coord_input.xy = coord_in.zw;
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#endif

coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);

coord_input.xy = coord_in.xy;
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#endif

coord_input.xy = coord_in.zw;
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#endif

coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
coord_input.xy = coord_in.xy;
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#endif

coord_input.xy = coord_in.zw;
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#endif

coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
coord_input.xy = coord_in.xy;
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
|
||||
#endif
|
||||
|
||||
coord_input.xy = coord_in.zw;
|
||||
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
|
||||
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
|
||||
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
|
||||
#if (VX_VERSION==1)
|
||||
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
|
||||
#else
|
||||
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
|
||||
src1.s0 = src0.s1;
|
||||
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
|
||||
#endif
|
||||
|
||||
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
}
|
||||
|
|
@ -0,0 +1,333 @@
#pragma OPENCL EXTENSION cl_viv_vx_extension : enable
#include "cl_viv_vx_ext.h"

#define GET_MATRIX_VALUE \
    float4 matrix0; \
    float2 matrix1; \
    Image img1 = create_image_from_image2d(matrix, 4); \
    __global float* matrix_ptr = (__global float*)img1.ptr; \
    matrix0 = vload4(0, matrix_ptr); \
    matrix1 = vload2(2, matrix_ptr); \

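/* GET_MATRIX_VALUE reads the six affine coefficients m0..m5 from the matrix
 * image: matrix0 = (m0, m1, m2, m3) and matrix1 = (m4, m5). As used below,
 * an output pixel (x, y) samples the input at
 *   x' = m0 * x + m2 * y + m4,  y' = m1 * x + m3 * y + m5. */
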
__kernel void custom_warp_affine_nearest_neighbor_U8toU8_rgb_2D_optional_input
(
    __read_only image2d_array_t input,
    __read_only image2d_t matrix,
    __write_only image2d_array_t output,
    float _m0,
    float _m1,
    float _m2,
    float _m3,
    float _m4,
    float _m5
)
{
    int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1));
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));

    float4 coord_f = convert_float4(coord_in);
    int2 coord_matrix = (int2)(0,0);
    GET_MATRIX_VALUE

    coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;

    coord_in.x = floor(coord_f.x) * 3;
    coord_in.y = floor(coord_f.y);
    coord_in.z = floor(coord_f.z) * 3;
    coord_in.w = floor(coord_f.w);
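    /* Nearest-neighbor sampling: the affine result is truncated with floor()
     * and the x coordinates are scaled by 3 because the RGB image is packed
     * as interleaved bytes, so channel c of pixel x lives at byte 3 * x + c. */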

    vxc_uchar16 dst;
    VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
    coord_in.x = coord_in.x + 1;
    VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
    coord_in.x = coord_in.x + 1;
    VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));

    VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
    coord_in.z = coord_in.z + 1;
    VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
    coord_in.z = coord_in.z + 1;
    VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));

    VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
}

__kernel void custom_warp_affine_bilinear_U8toU8_rgb_2D_optional_input
(
    __read_only image2d_array_t input,
    __read_only image2d_t matrix,
    __write_only image2d_array_t output,
    float _m0,
    float _m1,
    float _m2,
    float _m3,
    float _m4,
    float _m5
)
{
    int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1));
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));

    float4 coord_f = convert_float4(coord_in);
    int2 coord_matrix = (int2)(0,0);
    GET_MATRIX_VALUE

    coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;

    coord_in.x = floor(coord_f.x) * 3;
    coord_in.y = floor(coord_f.y);
    coord_in.z = floor(coord_f.z) * 3;
    coord_in.w = floor(coord_f.w);

    vxc_uchar16 src0, src1, src_0, src_1, dst;
    VXC_ReadImage(src_0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(src_1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));

    src0.x = src_0.s0;
    src0.y = src_0.s3;
    src1.x = src_1.s0;
    src1.y = src_1.s3;
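    /* Packed RGB: the two horizontal neighbors of one channel sit 3 bytes
     * apart, so lanes s0/s3 (and later s1/s4, s2/s5) pair the left and right
     * samples of the R, G and B channels for the bilinear blend below. */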

#if (VX_VERSION==1)
    VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#else
    VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
    src1.s0 = src0.s1;
    VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#endif

    src0.x = src_0.s1;
    src0.y = src_0.s4;
    src1.x = src_1.s1;
    src1.y = src_1.s4;
#if (VX_VERSION==1)
    VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#else
    VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
    src1.s0 = src0.s1;
    VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#endif

    src0.x = src_0.s2;
    src0.y = src_0.s5;
    src1.x = src_1.s2;
    src1.y = src_1.s5;
#if (VX_VERSION==1)
    VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#else
    VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
    src1.s0 = src0.s1;
    VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#endif

    VXC_ReadImage(src_0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(src_1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));

    src0.x = src_0.s0;
    src0.y = src_0.s3;
    src1.x = src_1.s0;
    src1.y = src_1.s3;
#if (VX_VERSION==1)
    VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#else
    VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
    src1.s0 = src0.s1;
    VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#endif

    src0.x = src_0.s1;
    src0.y = src_0.s4;
    src1.x = src_1.s1;
    src1.y = src_1.s4;
#if (VX_VERSION==1)
    VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#else
    VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
    src1.s0 = src0.s1;
    VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#endif

    src0.x = src_0.s2;
    src0.y = src_0.s5;
    src1.x = src_1.s2;
    src1.y = src_1.s5;
#if (VX_VERSION==1)
    VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#else
    VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
    src1.s0 = src0.s1;
    VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#endif

    VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
}

__kernel void custom_warp_affine_nearest_neighbor_U8toU8_rgb_optional_input
(
    __read_only image2d_array_t input,
    __read_only image2d_t matrix,
    __write_only image2d_array_t output,
    float _m0,
    float _m1,
    float _m2,
    float _m3,
    float _m4,
    float _m5
)
{
    int4 coord = (int4)(get_global_id(0) * 3, get_global_id(1), get_global_id(2), get_global_id(2));
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));

    float4 coord_f = convert_float4(coord_in);
    int2 coord_matrix = (int2)(0,0);
    GET_MATRIX_VALUE

    coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;

    coord_in.x = floor(coord_f.x) * 3;
    coord_in.y = floor(coord_f.y);
    coord_in.z = floor(coord_f.z) * 3;
    coord_in.w = floor(coord_f.w);

    int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));
    int8 input_desc;
    _viv_asm(COPY, input_desc, input, sizeof(input_desc));
    int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;
    _viv_asm(MOV, coord_input.w, baseAddr);

    vxc_uchar16 dst;
    VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
    coord_input.x = coord_input.x + 1;
    VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
    coord_input.x = coord_input.x + 1;
    VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
    coord_input.xy = coord_in.zw;
    VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
    coord_input.x = coord_input.x + 1;
    VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
    coord_input.x = coord_input.x + 1;
    VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));

    VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
}

__kernel void custom_warp_affine_bilinear_U8toU8_rgb_optional_input
(
    __read_only image2d_array_t input,
    __read_only image2d_t matrix,
    __write_only image2d_array_t output,
    float _m0,
    float _m1,
    float _m2,
    float _m3,
    float _m4,
    float _m5
)
{
    int4 coord = (int4)(get_global_id(0) * 3, get_global_id(1), get_global_id(2), get_global_id(2));
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));

    float4 coord_f = convert_float4(coord_in);
    int2 coord_matrix = (int2)(0,0);
    GET_MATRIX_VALUE

    coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;

    coord_in.x = floor(coord_f.x) * 3;
    coord_in.y = floor(coord_f.y);
    coord_in.z = floor(coord_f.z) * 3;
    coord_in.w = floor(coord_f.w);

    int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));
    int8 input_desc;
    _viv_asm(COPY, input_desc, input, sizeof(input_desc));
    int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;
    _viv_asm(MOV, coord_input.w, baseAddr);
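    /* 3D addressing trick: _viv_asm(COPY, ...) exposes the image descriptor,
     * from which a per-slice base address is formed as z * s4 + s0 (s0 appears
     * to be the base pointer and s4 the slice stride here) and carried in
     * coord_input.w so img_load_3d fetches from the selected slice. */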

    vxc_uchar16 src0, src1, src_0, src_1, dst;
    VXC_OP4(img_load_3d, src_0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
        VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
    VXC_OP4(img_load_3d, src_1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
        VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));

    src0.x = src_0.s0;
    src0.y = src_0.s3;
    src1.x = src_1.s0;
    src1.y = src_1.s3;

#if (VX_VERSION==1)
    VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#else
    VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
    src1.s0 = src0.s1;
    VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#endif

    src0.x = src_0.s1;
    src0.y = src_0.s4;
    src1.x = src_1.s1;
    src1.y = src_1.s4;
#if (VX_VERSION==1)
    VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#else
    VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
    src1.s0 = src0.s1;
    VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#endif

    src0.x = src_0.s2;
    src0.y = src_0.s5;
    src1.x = src_1.s2;
    src1.y = src_1.s5;
#if (VX_VERSION==1)
    VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#else
    VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
    src1.s0 = src0.s1;
    VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#endif

    coord_input.xy = coord_in.zw;
    VXC_OP4(img_load_3d, src_0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
        VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
    VXC_OP4(img_load_3d, src_1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
        VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));

    src0.x = src_0.s0;
    src0.y = src_0.s3;
    src1.x = src_1.s0;
    src1.y = src_1.s3;
#if (VX_VERSION==1)
    VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#else
    VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
    src1.s0 = src0.s1;
    VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#endif

    src0.x = src_0.s1;
    src0.y = src_0.s4;
    src1.x = src_1.s1;
    src1.y = src_1.s4;
#if (VX_VERSION==1)
    VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#else
    VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
    src1.s0 = src0.s1;
    VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#endif

    src0.x = src_0.s2;
    src0.y = src_0.s5;
    src1.x = src_1.s2;
    src1.y = src_1.s5;
#if (VX_VERSION==1)
    VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#else
    VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
    src1.s0 = src0.s1;
    VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#endif

    VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
}

@ -1,6 +1,8 @@
#include "cl_viv_vx_ext.h"

_viv_uniform int indices_num;
_viv_uniform int remainder;
_viv_uniform int width;
_viv_uniform VXC_512Bits uniExtraCopyDpKeepinEvis_2x8;

__kernel void gather_I8toI8_array(
@ -131,10 +133,12 @@ __kernel void gather_##src0_type_name##to##src0_type_name##_axis0_array( \
    int axis_num \
    ) \
{ \
    int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \
 \
    if (coord.x >= width) return; \
    Image img0 = create_image_from_image2d(input0, 1); \
    Image img1 = create_image_from_image2d(input1, 4); \
    Image img2 = create_image_from_image2d(output, 1); \
    int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \
    uchar* index_ptr = get_image_ptr_from_coord(img1, coord.xz); \
    __global int* index = (__global int*)index_ptr; \
    int4 indices = vload4(0, index); \
@ -146,10 +150,30 @@ __kernel void gather_##src0_type_name##to##src0_type_name##_axis0_array( \
    __global data_type* data_ptr = (__global data_type*)input_ptr; \
    __global write_type* out_ptr = (__global write_type*)output_ptr; \
    indices = indices >= 0 ? indices : indices + axis_num; \
    src.s0 = data_ptr[indices.x]; \
    src.s1 = data_ptr[indices.y]; \
    src.s2 = data_ptr[indices.z]; \
    src.s3 = data_ptr[indices.w]; \
    if (coord.x + remainder < width) \
    { \
        src.s0 = data_ptr[indices.x]; \
        src.s1 = data_ptr[indices.y]; \
        src.s2 = data_ptr[indices.z]; \
        src.s3 = data_ptr[indices.w]; \
    } \
    else \
    { \
        __global data_type* out_ptr_remainder = (__global data_type*)output_ptr; \
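        /* Tail: fewer than 4 outputs remain past this point, so store them \
         * one at a time; the switch cases intentionally fall through from 3 \
         * down to 1. */ \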
        switch (remainder) \
        { \
            case 3: \
                out_ptr_remainder[2] = data_ptr[indices.z]; \
            case 2: \
                out_ptr_remainder[1] = data_ptr[indices.y]; \
            case 1: \
                out_ptr_remainder[0] = data_ptr[indices.x]; \
                break; \
            default: \
                break; \
        } \
        return; \
    } \
 \
    VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniExtraCopyDpKeepinEvis_2x8); \

@ -3,6 +3,9 @@
#define logE (1.44269502f)
#define twoLogE (2.88539004f)
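/* logE is log2(e), so exp(x) == exp2(x * logE); twoLogE = 2 * log2(e) serves
 * the tanh identity tanh(x) = 2 / (1 + exp(-2x)) - 1 used by these kernels. */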

_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;

float4 sigmoid(float4 x)
{
    x *= -logE;
@ -104,3 +107,53 @@ GRUCELL_ACTIVATION_SIGMOID_TANH(F16, F16, F16, U8, hsigmoid,
#undef UCHAR8
#undef SHORT8
#undef HALF8

#define GRUCELL_ACTIVATION_SIGMOID_TANH_BF16(activater) \
__kernel void grucell_activation_BF16_BF16_BF16_to_BF16_##activater \
    ( \
    __read_only image2d_array_t input0, \
    __read_only image2d_array_t input1, \
    __read_only image2d_array_t input2, \
    __write_only image2d_array_t output, \
    __write_only image2d_array_t hstate, \
    int gate_activation, \
    int candidate_activation \
    ) \
{ \
    vxc_short8 src00, src10, src20, data0, data1; \
    float4 src01, src11, src21; \
 \
    int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
    VXC_ReadImage(src00, input0, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(src10, input1, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(src20, input2, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
    VXC_DP2x8(data0, src00, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src01, data0, 16); \
    VXC_DP2x8(data1, src10, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src11, data1, 16); \
    VXC_DP2x8(data0, src20, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src21, data0, 16); \
 \
    src01 = src01 * tensorScale.xxxx - tensorZP.xxxx; \
    src01 = activater(src01); \
 \
    src11 = src11 * tensorScale.yyyy - tensorZP.yyyy; \
    src11 = tangentH(src11); \
 \
    src21 = src21 * tensorScale.zzzz - tensorZP.zzzz; \
 \
    src11 = src11 - src01 * src11; \
    src11 = src01 * src21 + src11; \
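    /* The two statements above evaluate src11 = (1 - src01) * src11 + src01 * src21 \
     * as a pair of fused multiply-adds: the usual gated blend of the two \
     * activations by the gate value src01. */ \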
 \
    src11 = src11 * tensorScale.wwww + tensorZP.wwww; \
    _viv_asm(COPY, src00, src11, 16); \
    VXC_DP2x8(data0, src00, src00, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
    VXC_WriteImage(output, coord, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_WriteImage(hstate, coord, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
GRUCELL_ACTIVATION_SIGMOID_TANH_BF16(sigmoid)
GRUCELL_ACTIVATION_SIGMOID_TANH_BF16(hsigmoid)

@ -3,6 +3,11 @@
_viv_uniform VXC_512Bits uniA_Minus_B_2x8;
_viv_uniform VXC_512Bits uniA_Times_B_2x8;
_viv_uniform VXC_512Bits uniA_Plus_B_2x8;

_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;

__kernel void grucell_activation_sma_F16_F16_F16toF16
    (
    __read_only image2d_array_t input0,
@ -61,3 +66,101 @@ __kernel void grucell_activation_sma_F16_F16_F16toF16_2D
    VXC_WriteImage(h_status, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

__kernel void grucell_activation_sma_BF16_BF16_BF16toBF16
    (
    __read_only image2d_array_t input0,
    __read_only image2d_array_t input1,
    __read_only image2d_array_t input2,
    __write_only image2d_array_t output,
    __write_only image2d_array_t h_status
    )
{
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);

    float4 src0, src00, src1, src11, src2, src22, minus, minus1, dst, dst1;
    vxc_ushort8 vec0, vec1, vec2, data0, data1;

    VXC_ReadImage2DArray(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage2DArray(vec1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage2DArray(vec2, input2, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
    VXC_DP2x8(data0, vec0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    VXC_DP2x8(data1, vec0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, src0, data0, 16);
    _viv_asm(COPY, src00, data1, 16);
    VXC_DP2x8(data0, vec1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    VXC_DP2x8(data1, vec1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, src1, data0, 16);
    _viv_asm(COPY, src11, data1, 16);
    VXC_DP2x8(data0, vec2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    VXC_DP2x8(data1, vec2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, src2, data0, 16);
    _viv_asm(COPY, src22, data1, 16);

    minus = src0 - src1;
    minus1 = src00 - src11;

    dst = minus * src2 + src1;
    dst1 = minus1 * src22 + src11;
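    /* The "sma" update is a per-element linear interpolation,
     * dst = src1 + (src0 - src1) * src2, computed separately for the low
     * (dst) and high (dst1) float4 halves of the eight BF16 lanes. */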
    _viv_asm(COPY, vec0, dst, 16);
    _viv_asm(COPY, vec1, dst1, 16);
    VXC_DP2x8(data0, vec0, vec1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);

    VXC_WriteImage2DArray(output, coord, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    VXC_WriteImage(h_status, coord, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

__kernel void grucell_activation_sma_BF16_BF16_BF16toBF16_2D
    (
    __read_only image2d_array_t input0,
    __read_only image2d_array_t input1,
    __read_only image2d_array_t input2,
    __write_only image2d_array_t output,
    __write_only image2d_array_t h_status
    )
{
    int2 coord = (int2)(get_global_id(0), get_global_id(1));

    float4 src0, src00, src1, src11, src2, src22, minus, minus1, dst, dst1;
    vxc_ushort8 vec0, vec1, vec2, data0, data1;

    VXC_ReadImage(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(vec1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(vec2, input2, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
    VXC_DP2x8(data0, vec0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    VXC_DP2x8(data1, vec0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, src0, data0, 16);
    _viv_asm(COPY, src00, data1, 16);
    VXC_DP2x8(data0, vec1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    VXC_DP2x8(data1, vec1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, src1, data0, 16);
    _viv_asm(COPY, src11, data1, 16);
    VXC_DP2x8(data0, vec2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    VXC_DP2x8(data1, vec2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, src2, data0, 16);
    _viv_asm(COPY, src22, data1, 16);

    minus = src0 - src1;
    minus1 = src00 - src11;

    dst = minus * src2 + src1;
    dst1 = minus1 * src22 + src11;
    _viv_asm(COPY, vec0, dst, 16);
    _viv_asm(COPY, vec1, dst1, 16);
    VXC_DP2x8(data0, vec0, vec1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
    VXC_WriteImage(output, coord, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    VXC_WriteImage(h_status, coord, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

@ -3,6 +3,9 @@
#define logE (1.44269502f)
#define twoLogE (logE * 2.0f)

_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;

float4 sigmoid_func(float4 x)
{
    x *= -logE;
@ -128,3 +131,52 @@ GRUCELL_QNT_F16TO_QNT(I16, I16, SIGMOID, sigmoid_func, vxc_short8, vxc_short8)
GRUCELL_QNT_F16TO_QNT(U8, U8, HSIGMOID, hard_sigmoid, vxc_uchar8, vxc_uchar8)
GRUCELL_QNT_F16TO_QNT(I8, I8, HSIGMOID, hard_sigmoid, vxc_char8, vxc_char8)
GRUCELL_QNT_F16TO_QNT(I16, I16, HSIGMOID, hard_sigmoid, vxc_short8, vxc_short8)

#define GRUCELL_BF16(act_name, act_func) \
__kernel void grucell_activation_z_h_BF16_BF16toBF16_##act_name( \
    __read_only image2d_t hstate_in, \
    __read_only image2d_t input_z_conv, \
    __read_only image2d_t input_h_conv, \
    __read_only image2d_t hstate_z_conv, \
    __read_only image2d_t hstate_h_conv, \
    __write_only image2d_t output, \
    __write_only image2d_t hstate_out \
    ) \
{ \
    int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \
    vxc_short8 v0, v1, v2, v3, v4, v5, v6, data0, data1; \
    float4 src0, src1, src2, src3, src4, src5, src6; \
    VXC_ReadImage(v2, hstate_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(v4, input_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(v5, input_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(v6, hstate_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(v3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
    VXC_DP2x8(data0, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src2, data0, 16); \
    VXC_DP2x8(data1, v4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src4, data1, 16); \
    VXC_DP2x8(data0, v5, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src5, data0, 16); \
    VXC_DP2x8(data1, v6, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src6, data1, 16); \
    VXC_DP2x8(data0, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src3, data0, 16); \
 \
    float4 h = src2 + src4; \
    float4 z = src5 + src6; \
    h = tanh_func(h); \
    z = act_func(z); \
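    /* GRU output below: blend the candidate h with the previous state src3 \
     * by the update gate z, i.e. result = (1 - z) * h + z * src3. */ \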
    float4 result = (1 - z) * h + z * src3; \
    _viv_asm(COPY, v0, result, 16); \
    VXC_DP2x8(data0, v0, v0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
    VXC_WriteImage(output, coord_in, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_WriteImage(hstate_out, coord_in, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
GRUCELL_BF16(SIGMOID, sigmoid_func)
GRUCELL_BF16(HSIGMOID, hard_sigmoid)

@ -0,0 +1,344 @@
#include "cl_viv_vx_ext.h"

#define logE (1.44269502f)
#define twoLogE (2.88539004f)

_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;

float4 sigmoid(float4 x)
{
    x *= -logE;
    x = 1 + exp2(x);
    return 1 / x;
}
float4 hard_sigmoid(float4 x)
{
    x = 0.2 * x + 0.5;
    x = clamp(x, 0, 1);
    return x;
}
float4 tangentH(float4 x)
{
    x *= -twoLogE;
    x = 1 + exp2(x);
    x = 1 / x;
    return 2 * x - 1;
}
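/* The helpers above rewrite the activations in terms of exp2 (the native
 * hardware instruction): sigmoid(x) = 1 / (1 + 2^(-x * log2 e)) and
 * tanh(x) = 2 * sigmoid(2x) - 1, with logE and twoLogE holding log2(e)
 * and 2 * log2(e). */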

__kernel void grucell_activation_cdnn_sep_BF16_BF16_BF16_to_BF16_NC(
    __read_only image2d_array_t prev_state,
    __read_only image2d_array_t input_r,
    __read_only image2d_array_t input_z,
    __read_only image2d_array_t input_c,
    __read_only image2d_array_t recur_r,
    __read_only image2d_array_t recur_z,
    __read_only image2d_array_t recur_c,
    __read_only image2d_t bias_r,
    __read_only image2d_t bias_z,
    __read_only image2d_t bias_c,
    __read_only image2d_t cond_r,
    __read_only image2d_t cond_z,
    __read_only image2d_t cond_c,
    __write_only image2d_array_t output,
    __write_only image2d_array_t hstate,
    int gate_activation, int candidate_activation, int batch_first)
{
    vxc_ushort8 s0, s1, s2, s3, s4, s5, s7, data0, data1;
    float4 r0, r1, z0, z1, c0, c1, state;
    float4 r, r2, r3, z, z2, z3, c, c2, c3;
    int2 coord = (int2)(get_global_id(0), get_global_id(1));

    VXC_ReadImage(s0, input_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(s1, recur_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(s2, input_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(s3, recur_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(s4, input_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(s5, recur_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(s7, prev_state, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
    r2 = read_imagef(bias_r, coord);
    r3 = read_imagef(cond_r, coord);
    z2 = read_imagef(bias_z, coord);
    z3 = read_imagef(cond_z, coord);
    c2 = read_imagef(bias_c, coord);
    c3 = read_imagef(cond_c, coord);
    vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
    VXC_DP2x8(data0, s0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, r0, data0, 16);
    VXC_DP2x8(data1, s1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, r1, data1, 16);
    VXC_DP2x8(data0, s2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, z0, data0, 16);
    VXC_DP2x8(data1, s3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, z1, data1, 16);
    VXC_DP2x8(data0, s4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, c0, data0, 16);
    VXC_DP2x8(data1, s5, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, c1, data1, 16);
    VXC_DP2x8(data0, s7, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, state, data0, 16);
    r = r0 + r1 + r2 + r3;
    z = z0 + z1 + z2 + z3;

    r = sigmoid(r);
    z = sigmoid(z);

    c = c2 * r + c3;
    c = c0 + c1 * r + c;
    c = tangentH(c);
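    /* cuDNN-style GRU state update: h' = z * h + (1 - z) * c, rearranged
     * below into the single multiply-add z * (state - c) + c. */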

    state = z * (state - c) + c;
    _viv_asm(COPY, s0, state, 16);
    VXC_DP2x8(data0, s0, s0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);

    VXC_WriteImage(output, coord, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
    VXC_WriteImage(hstate, coord, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}

__kernel void grucell_activation_cdnn_sep_BF16_BF16_BF16_to_BF16_CN(
    __read_only image2d_array_t prev_state,
    __read_only image2d_array_t input_r,
    __read_only image2d_array_t input_z,
    __read_only image2d_array_t input_c,
    __read_only image2d_array_t recur_r,
    __read_only image2d_array_t recur_z,
    __read_only image2d_array_t recur_c,
    __read_only image2d_t bias_r,
    __read_only image2d_t bias_z,
    __read_only image2d_t bias_c,
    __read_only image2d_t cond_r,
    __read_only image2d_t cond_z,
    __read_only image2d_t cond_c,
    __write_only image2d_array_t output,
    __write_only image2d_array_t hstate,
    int gate_activation, int candidate_activation, int batch_first)
{
    vxc_ushort8 s0, s1, s2, s3, s4, s5, s7, data0, data1;
    float4 r0, r1, z0, z1, c0, c1, state;
    float4 r, r2, r3, z, z2, z3, c, c2, c3;
    int2 coord = (int2)(get_global_id(0), get_global_id(1));

    VXC_ReadImage(s0, input_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(s1, recur_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(s2, input_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(s3, recur_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(s4, input_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(s5, recur_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(s7, prev_state, coord.yx, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(s7, prev_state, coord.yx, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(s7, prev_state, coord.yx, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(s7, prev_state, coord.yx, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
    r2 = read_imagef(bias_r, coord.yx);
    r3 = read_imagef(cond_r, coord.yx);
    z2 = read_imagef(bias_z, coord.yx);
    z3 = read_imagef(cond_z, coord.yx);
    c2 = read_imagef(bias_c, coord.yx);
    c3 = read_imagef(cond_c, coord.yx);
    vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
    VXC_DP2x8(data0, s0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, r0, data0, 16);
    VXC_DP2x8(data1, s1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, r1, data1, 16);
    VXC_DP2x8(data0, s2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, z0, data0, 16);
    VXC_DP2x8(data1, s3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, z1, data1, 16);
    VXC_DP2x8(data0, s4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, c0, data0, 16);
    VXC_DP2x8(data1, s5, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, c1, data1, 16);
    VXC_DP2x8(data0, s7, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, state, data0, 16);

    r = r0 + r1 + r2.xxxx + r3.xxxx;
    z = z0 + z1 + z2.xxxx + z3.xxxx;

    r = sigmoid(r);
    z = sigmoid(z);

    c = c2.xxxx * r + c3.xxxx;
    c = c0 + c1 * r + c;
    c = tangentH(c);
    state = z * (state - c) + c;

    _viv_asm(COPY, s0, state, 16);
    VXC_DP2x8(data0, s0, s0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);

    VXC_WriteImage(output, coord.yx, data0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
    VXC_WriteImage(hstate, coord.yx, data0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
    coord.x++;
    VXC_WriteImage(output, coord.yx, data0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
    VXC_WriteImage(hstate, coord.yx, data0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
    coord.x++;
    VXC_WriteImage(output, coord.yx, data0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
    VXC_WriteImage(hstate, coord.yx, data0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
    coord.x++;
    VXC_WriteImage(output, coord.yx, data0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
    VXC_WriteImage(hstate, coord.yx, data0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
}

__kernel void grucell_activation_cdnn_sep_BF16_BF16_BF16_to_BF16_CN_FULL(
    __read_only image2d_array_t prev_state,
    __read_only image2d_array_t input_r,
    __read_only image2d_array_t input_z,
    __read_only image2d_array_t input_c,
    __read_only image2d_array_t recur_r,
    __read_only image2d_array_t recur_z,
    __read_only image2d_array_t recur_c,
    __read_only image2d_t bias_r,
    __read_only image2d_t bias_z,
    __read_only image2d_t bias_c,
    __read_only image2d_t cond_r,
    __read_only image2d_t cond_z,
    __read_only image2d_t cond_c,
    __write_only image2d_array_t output,
    __write_only image2d_array_t hstate,
    int gate_activation, int candidate_activation, int batch_first)
{
    vxc_ushort8 s0, s1, s2, s3, s4, s5, s7, data0, data1;
    float4 r0, r1, z0, z1, c0, c1, state;
    float4 r, r2, r3, z, z2, z3, c, c2, c3;
    int2 coord = (int2)(get_global_id(0), get_global_id(1));

    VXC_ReadImage(s0, input_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(s1, recur_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(s2, input_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(s3, recur_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(s4, input_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(s5, recur_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(s7, prev_state, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
    r2 = read_imagef(bias_r, coord.yx);
    r3 = read_imagef(cond_r, coord.yx);
    z2 = read_imagef(bias_z, coord.yx);
    z3 = read_imagef(cond_z, coord.yx);
    c2 = read_imagef(bias_c, coord.yx);
    c3 = read_imagef(cond_c, coord.yx);
    vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
    VXC_DP2x8(data0, s0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, r0, data0, 16);
    VXC_DP2x8(data1, s1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, r1, data1, 16);
    VXC_DP2x8(data0, s2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, z0, data0, 16);
    VXC_DP2x8(data1, s3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, z1, data1, 16);
    VXC_DP2x8(data0, s4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, c0, data0, 16);
    VXC_DP2x8(data1, s5, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, c1, data1, 16);
    VXC_DP2x8(data0, s7, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, state, data0, 16);

    r = r0 + r1 + r2.xxxx + r3.xxxx;
    z = z0 + z1 + z2.xxxx + z3.xxxx;

    r = sigmoid(r);
    z = sigmoid(z);

    c = c2.xxxx * r + c3.xxxx;
    c = c0 + c1 * r + c;
    c = tangentH(c);
    state = z * (state - c) + c;

    _viv_asm(COPY, s0, state, 16);
    VXC_DP2x8(data0, s0, s0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
    VXC_WriteImage(output, coord, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
    VXC_WriteImage(hstate, coord, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}

__kernel void grucell_activation_cdnn_BF16_BF16_BF16_to_BF16(
    __read_only image2d_array_t prev_state,
    __read_only image2d_array_t input_rzc,
    __read_only image2d_array_t recur_rzc,
    __read_only image2d_t bias_r,
    __read_only image2d_t bias_z,
    __read_only image2d_t bias_c,
    __read_only image2d_t cond_r,
    __read_only image2d_t cond_z,
    __read_only image2d_t cond_c,
    __write_only image2d_array_t output,
    __write_only image2d_array_t hstate,
    int gate_activation, int candidate_activation, int batch_first)
{
    vxc_ushort8 s0, s1, s2, s3, s4, s5, s7, data0, data1;
    float4 r0, r1, z0, z1, c0, c1, state;
    float4 r, r2, r3, z, z2, z3, c, c2, c3;

    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1) * 3, get_global_id(1));

    VXC_ReadImage(s0, input_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(s1, recur_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(s2, input_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(s3, recur_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(s4, input_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(s5, recur_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(s7, prev_state, coord.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));

    r2 = read_imagef(bias_r, coord.xy);
    r3 = read_imagef(cond_r, coord.xy);
    z2 = read_imagef(bias_z, coord.xy);
    z3 = read_imagef(cond_z, coord.xy);
    c2 = read_imagef(bias_c, coord.xy);
    c3 = read_imagef(cond_c, coord.xy);

    vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
    VXC_DP2x8(data0, s0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, r0, data0, 16);
    VXC_DP2x8(data1, s1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, r1, data1, 16);
    VXC_DP2x8(data0, s2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, z0, data0, 16);
    VXC_DP2x8(data1, s3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, z1, data1, 16);
    VXC_DP2x8(data0, s4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, c0, data0, 16);
    VXC_DP2x8(data1, s5, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, c1, data1, 16);
    VXC_DP2x8(data0, s7, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
        uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, state, data0, 16);

    r = r0 + r1 + r2 + r3;
    z = z0 + z1 + z2 + z3;

    r = sigmoid(r);
    z = sigmoid(z);

    c = c2 * r + c3;
    c = c0 + c1 * r + c;
    c = tangentH(c);
    state = z * (state - c) + c;

    _viv_asm(COPY, s0, state, 16);
    VXC_DP2x8(data0, s0, s0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
    VXC_WriteImage(output, coord.xy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
    VXC_WriteImage(hstate, coord.xy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}

@ -3,6 +3,9 @@
#define logE (1.44269502f)
#define twoLogE (logE * 2.0f)

_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;

float4 sigmoid_func(float4 x)
{
    x *= -logE;
@ -98,3 +101,39 @@ GRUCELL_QNT_F16TO_F16(I16, SIGMOID, sigmoid_func, vxc_short8)
GRUCELL_QNT_F16TO_F16(U8, HSIGMOID, hard_sigmoid, vxc_uchar8)
GRUCELL_QNT_F16TO_F16(I8, HSIGMOID, hard_sigmoid, vxc_char8)
GRUCELL_QNT_F16TO_F16(I16, HSIGMOID, hard_sigmoid, vxc_short8)

#define GRUCELL_BF16(act_name, act_func) \
__kernel void grucell_h_times_activation_r_BF16_BF16toBF16_##act_name( \
    __read_only image2d_t hstate_in, \
    __read_only image2d_t input_r_conv, \
    __read_only image2d_t hstate_r_conv, \
    __write_only image2d_t output \
    ) \
{ \
    int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \
    vxc_short8 v0, v1, v2, v3, data0, data1; \
    float4 src0, src1, src2, src3; \
    VXC_ReadImage(v0, input_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(v1, hstate_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(v3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
    VXC_DP2x8(data0, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src0, data0, 16); \
    VXC_DP2x8(data1, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src1, data1, 16); \
    VXC_DP2x8(data0, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src3, data0, 16); \
 \
    float4 r; \
    r = src0 + src1; \
    r = act_func(r); \
    float4 result = r * src3; \
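    /* result = act(input_r_conv + hstate_r_conv) * hstate_in: the reset \
     * gate r applied elementwise to the previous hidden state, as the \
     * kernel name h_times_activation_r suggests. */ \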
    _viv_asm(COPY, v0, result, 16); \
    VXC_DP2x8(data0, v0, v0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
    VXC_WriteImage(output, coord_in, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
GRUCELL_BF16(SIGMOID, sigmoid_func)
GRUCELL_BF16(HSIGMOID, hard_sigmoid)

@ -3,6 +3,9 @@
#define logE (1.44269502f)
#define twoLogE (logE * 2.0f)

_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;

float4 sigmoid_func(float4 x)
{
    x *= -logE;
@ -150,3 +153,65 @@ GRUCELL_QNT_F16TO_QNT(I16_F16toI16_TANH_SIGMOID, tanh_func, sigmoid_func,
GRUCELL_QNT_F16TO_QNT(U8_F16toU8_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_uchar8, vxc_uchar8)
GRUCELL_QNT_F16TO_QNT(I8_F16toI8_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_char8, vxc_char8)
GRUCELL_QNT_F16TO_QNT(I16_F16toI16_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_short8, vxc_short8)

#define GRUCELL_BF16(act_name, act_func, rec_act_name, rec_act_func) \
__kernel void grucell_reset_after_activation_BF16_BF16toBF16_##act_name##_##rec_act_name( \
    __read_only image2d_t hstate_in, \
    __read_only image2d_t input_z_conv, \
    __read_only image2d_t input_r_conv, \
    __read_only image2d_t input_h_conv, \
    __read_only image2d_t hstate_z_conv, \
    __read_only image2d_t hstate_r_conv, \
    __read_only image2d_t hstate_h_conv, \
    __write_only image2d_t output, \
    __write_only image2d_t hstate_out \
    ) \
{ \
    int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \
    vxc_short8 v0, v1, v2, v3, v4, v5, v6, data0, data1; \
    float4 src0, src1, src2, src3, src4, src5, src6; \
    VXC_ReadImage(v0, input_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(v1, hstate_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(v2, hstate_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(v4, input_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(v5, input_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(v6, hstate_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(v3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
    VXC_DP2x8(data0, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src0, data0, 16); \
    VXC_DP2x8(data1, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src1, data1, 16); \
    VXC_DP2x8(data0, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src2, data0, 16); \
    VXC_DP2x8(data1, v4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src4, data1, 16); \
    VXC_DP2x8(data0, v5, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src5, data0, 16); \
    VXC_DP2x8(data1, v6, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src6, data1, 16); \
    VXC_DP2x8(data0, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src3, data0, 16); \
 \
    float4 r; \
    r = src0 + src1; \
    r = rec_act_func(r); \
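    /* Reset-after variant: r gates the already-projected recurrent term \
     * (src2 = hstate_h_conv), so the candidate below is \
     * h = input_h + r * hstate_h. */ \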
    float4 h = src4 + r * src2; \
    float4 z = src5 + src6; \
    h = act_func(h); \
    z = rec_act_func(z); \
    float4 result = (1 - z) * h + z * src3; \
    _viv_asm(COPY, v0, result, 16); \
    VXC_DP2x8(data0, v0, v0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
    VXC_WriteImage(output, coord_in, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_WriteImage(hstate_out, coord_in, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
GRUCELL_BF16(TANH, tanh_func, SIGMOID, sigmoid_func)
GRUCELL_BF16(SIGMOID, sigmoid_func, SIGMOID, sigmoid_func)

@ -0,0 +1,124 @@
#include "cl_viv_vx_ext.h"

_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;

_viv_uniform float logE;
_viv_uniform float twoLogE;
_viv_uniform float forget_bias;
float4 sigmoid(float4 x)
{
    x *= -logE;
    x = 1 + exp2(x);
    return 1 / x;
}
float4 hard_sigmoid(float4 x)
{
    x = 0.2 * x + 0.5;
    x = clamp(x, 0, 1);
    return x;
}
float4 tangentH(float4 x)
{
    x *= -twoLogE;
    x = 1 + exp2(x);
    x = 1 / x;
    return 2 * x - 1;
}
_viv_uniform float outputScale;
_viv_uniform float outputZP;
_viv_uniform float4 clip_Min_F;
_viv_uniform float4 clip_Max_F;

#define LSTMUNIT_BP_BF16(act_name, act_func) \
__kernel void lstmunit_activation_BP_BF16toBF16_BF16_##act_name( \
    __read_only image2d_array_t input_i_conv, \
    __read_only image2d_array_t input_f_conv, \
    __read_only image2d_array_t input_c_conv, \
    __read_only image2d_array_t input_o_conv, \
    __read_only image2d_t cell_state_in, \
    __read_only image2d_array_t hstate_i_conv, \
    __read_only image2d_array_t hstate_f_conv, \
    __read_only image2d_array_t hstate_c_conv, \
    __read_only image2d_array_t hstate_o_conv, \
    __read_only image2d_t bias_i, \
    __read_only image2d_t bias_f, \
    __read_only image2d_t bias_c, \
    __read_only image2d_t bias_o, \
    __write_only image2d_array_t output, \
    __write_only image2d_t cell_state_out, \
    int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \
    ) \
{ \
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
    vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \
    float4 src0, src1, src2, src3, src4; \
    vxc_short8 vect10, vect11, vect12, vect13; \
    float4 src10, src11, src12, src13; \
    float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \
    float4 b0, b1, b2, b3; \
    VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    b0 = read_imagef(bias_i, coord_in.xw); \
    b1 = read_imagef(bias_f, coord_in.xw); \
    b2 = read_imagef(bias_c, coord_in.xw); \
    b3 = read_imagef(bias_o, coord_in.xw); \
    vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
    VXC_DP2x8(data0, vect0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src0, data0, 16); \
    VXC_DP2x8(data1, vect10, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src10, data1, 16); \
    VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src1, data0, 16); \
    VXC_DP2x8(data1, vect11, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src11, data1, 16); \
    VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src2, data0, 16); \
    VXC_DP2x8(data1, vect12, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src12, data1, 16); \
    VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src3, data0, 16); \
    VXC_DP2x8(data1, vect13, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src13, data1, 16); \
    VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, data_c_t, data0, 16); \
    data_i_t = src0 + src10 + b0; \
    data_f_t = src1 + src11 + b1; \
    data_g_t = src2 + src12 + b2; \
    data_o_t = src3 + src13 + b3; \
 \
    data_i_t = act_func(data_i_t); \
    data_f_t = act_func(data_f_t + forget_bias); \
    data_g_t = tangentH(data_g_t); \
    data_i_t = data_i_t * data_g_t; \
    data_c_t = data_c_t * data_f_t + data_i_t; \
|
||||
data_o_t = act_func(data_o_t); \
|
||||
data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \
|
||||
data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \
|
||||
_viv_asm(COPY, vect0, data_c_t, 16); \
|
||||
VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
|
||||
VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
data_c_t = tangentH(data_c_t); \
|
||||
data_o_t = data_o_t * data_c_t * outputScale + outputZP; \
|
||||
_viv_asm(COPY, vect1, data_o_t, 16); \
|
||||
VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
|
||||
VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
}
|
||||
LSTMUNIT_BP_BF16(SIGMOID, sigmoid)
|
||||
LSTMUNIT_BP_BF16(HARD_SIGMOID, hard_sigmoid)
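A note on the activation helpers these files share: the uniforms logE and twoLogE must hold log2(e) and 2 * log2(e), because sigmoid() and tangentH() fold the change of base into a single exp2 per lane:

    sigmoid(x) = 1 / (1 + exp(-x))      = 1 / (1 + exp2(-x * log2(e)))
    tanh(x)    = 2 / (1 + exp(-2x)) - 1 = 2 / (1 + exp2(-x * 2 * log2(e))) - 1

hard_sigmoid is the usual piecewise-linear clamp(0.2 * x + 0.5, 0, 1) and needs no transcendental.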
@@ -0,0 +1,126 @@
#include "cl_viv_vx_ext.h"

_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;

_viv_uniform float logE;
_viv_uniform float twoLogE;
_viv_uniform float forget_bias;
float4 sigmoid(float4 x)
{
    x *= -logE;
    x = 1 + exp2(x);
    return 1 / x;
}
float4 hard_sigmoid(float4 x)
{
    x = 0.2 * x + 0.5;
    x = clamp(x, 0, 1);
    return x;
}
float4 tangentH(float4 x)
{
    x *= -twoLogE;
    x = 1 + exp2(x);
    x = 1 / x;
    return 2 * x - 1;
}
_viv_uniform float outputScale;
_viv_uniform float outputZP;
_viv_uniform float4 clip_Min_F;
_viv_uniform float4 clip_Max_F;

#define LSTMUNIT_B_BF16(act_name, act_func) \
__kernel void lstmunit_activation_B_BF16toBF16_BF16_##act_name( \
    __read_only image2d_array_t input_i_conv, \
    __read_only image2d_array_t input_f_conv, \
    __read_only image2d_array_t input_c_conv, \
    __read_only image2d_array_t input_o_conv, \
    __read_only image2d_t cell_state_in, \
    __read_only image2d_array_t hstate_i_conv, \
    __read_only image2d_array_t hstate_f_conv, \
    __read_only image2d_array_t hstate_c_conv, \
    __read_only image2d_array_t hstate_o_conv, \
    __read_only image2d_t bias_i, \
    __read_only image2d_t bias_f, \
    __read_only image2d_t bias_c, \
    __read_only image2d_t bias_o, \
    __write_only image2d_array_t output, \
    __write_only image2d_t cell_state_out, \
    __write_only image2d_t h_state_out, \
    int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \
    ) \
{ \
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
    vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \
    float4 src0, src1, src2, src3, src4; \
    vxc_short8 vect10, vect11, vect12, vect13; \
    float4 src10, src11, src12, src13; \
    float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \
    float4 b0, b1, b2, b3; \
    VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    b0 = read_imagef(bias_i, coord_in.xw); \
    b1 = read_imagef(bias_f, coord_in.xw); \
    b2 = read_imagef(bias_c, coord_in.xw); \
    b3 = read_imagef(bias_o, coord_in.xw); \
    vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
    VXC_DP2x8(data0, vect0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src0, data0, 16); \
    VXC_DP2x8(data1, vect10, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src10, data1, 16); \
    VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src1, data0, 16); \
    VXC_DP2x8(data1, vect11, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src11, data1, 16); \
    VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src2, data0, 16); \
    VXC_DP2x8(data1, vect12, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src12, data1, 16); \
    VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src3, data0, 16); \
    VXC_DP2x8(data1, vect13, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src13, data1, 16); \
    VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, data_c_t, data0, 16); \
    data_i_t = src0 + src10 + b0; \
    data_f_t = src1 + src11 + b1; \
    data_g_t = src2 + src12 + b2; \
    data_o_t = src3 + src13 + b3; \
    \
    data_i_t = act_func(data_i_t); \
    data_f_t = act_func(data_f_t + forget_bias); \
    data_g_t = tangentH(data_g_t); \
    data_i_t = data_i_t * data_g_t; \
    data_c_t = data_c_t * data_f_t + data_i_t; \
    data_o_t = act_func(data_o_t); \
    data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \
    data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \
    _viv_asm(COPY, vect0, data_c_t, 16); \
    VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
    VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    data_c_t = tangentH(data_c_t); \
    data_o_t = data_o_t * data_c_t * outputScale + outputZP; \
    _viv_asm(COPY, vect1, data_o_t, 16); \
    VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
    VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_WriteImage(h_state_out, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
LSTMUNIT_B_BF16(SIGMOID, sigmoid)
LSTMUNIT_B_BF16(HARD_SIGMOID, hard_sigmoid)
@@ -0,0 +1,111 @@
#include "cl_viv_vx_ext.h"

_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;

_viv_uniform float logE;
_viv_uniform float twoLogE;
_viv_uniform float forget_bias;
float4 sigmoid(float4 x)
{
    x *= -logE;
    x = 1 + exp2(x);
    return 1 / x;
}
float4 hard_sigmoid(float4 x)
{
    x = 0.2 * x + 0.5;
    x = clamp(x, 0, 1);
    return x;
}
float4 tangentH(float4 x)
{
    x *= -twoLogE;
    x = 1 + exp2(x);
    x = 1 / x;
    return 2 * x - 1;
}
_viv_uniform float outputScale;
_viv_uniform float outputZP;
_viv_uniform float4 clip_Min_F;
_viv_uniform float4 clip_Max_F;

#define LSTMUNIT_CBP_BF16(act_name, act_func) \
__kernel void lstmunit_activation_CBP_BF16toBF16_BF16_##act_name( \
    __read_only image2d_array_t input_f_conv, \
    __read_only image2d_array_t input_c_conv, \
    __read_only image2d_array_t input_o_conv, \
    __read_only image2d_t cell_state_in, \
    __read_only image2d_array_t hstate_f_conv, \
    __read_only image2d_array_t hstate_c_conv, \
    __read_only image2d_array_t hstate_o_conv, \
    __read_only image2d_t bias_f, \
    __read_only image2d_t bias_c, \
    __read_only image2d_t bias_o, \
    __write_only image2d_array_t output, \
    __write_only image2d_t cell_state_out, \
    int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \
    ) \
{ \
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
    vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \
    float4 src0, src1, src2, src3, src4; \
    vxc_short8 vect10, vect11, vect12, vect13; \
    float4 src10, src11, src12, src13; \
    float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \
    float4 b0, b1, b2, b3; \
    VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    b1 = read_imagef(bias_f, coord_in.xw); \
    b2 = read_imagef(bias_c, coord_in.xw); \
    b3 = read_imagef(bias_o, coord_in.xw); \
    vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
    VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src1, data0, 16); \
    VXC_DP2x8(data1, vect11, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src11, data1, 16); \
    VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src2, data0, 16); \
    VXC_DP2x8(data1, vect12, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src12, data1, 16); \
    VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src3, data0, 16); \
    VXC_DP2x8(data1, vect13, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src13, data1, 16); \
    VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, data_c_t, data0, 16); \
    data_f_t = src1 + src11 + b1; \
    data_g_t = src2 + src12 + b2; \
    data_o_t = src3 + src13 + b3; \
    \
    data_f_t = act_func(data_f_t + forget_bias); \
    data_g_t = tangentH(data_g_t); \
    data_i_t = 1.0 - data_f_t; \
    data_i_t = data_i_t * data_g_t; \
    data_c_t = data_c_t * data_f_t + data_i_t; \
    data_o_t = act_func(data_o_t); \
    data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \
    data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \
    _viv_asm(COPY, vect0, data_c_t, 16); \
    VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
    VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    data_c_t = tangentH(data_c_t); \
    data_o_t = data_o_t * data_c_t * outputScale + outputZP; \
    _viv_asm(COPY, vect1, data_o_t, 16); \
    VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
    VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
LSTMUNIT_CBP_BF16(SIGMOID, sigmoid)
LSTMUNIT_CBP_BF16(HARD_SIGMOID, hard_sigmoid)
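In the C* variants the input gate is never read from an image; it is coupled to the forget gate as data_i_t = 1.0 - data_f_t, giving the CIFG cell update c_t = f_t * c_prev + (1 - f_t) * g_t. This matches the _is_cifg flag carried in every kernel signature; note that the kernel bodies never branch on it, so the flag is resolved when the kernel variant is chosen.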
@@ -0,0 +1,113 @@
#include "cl_viv_vx_ext.h"

_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;

_viv_uniform float logE;
_viv_uniform float twoLogE;
_viv_uniform float forget_bias;
float4 sigmoid(float4 x)
{
    x *= -logE;
    x = 1 + exp2(x);
    return 1 / x;
}
float4 hard_sigmoid(float4 x)
{
    x = 0.2 * x + 0.5;
    x = clamp(x, 0, 1);
    return x;
}
float4 tangentH(float4 x)
{
    x *= -twoLogE;
    x = 1 + exp2(x);
    x = 1 / x;
    return 2 * x - 1;
}
_viv_uniform float outputScale;
_viv_uniform float outputZP;
_viv_uniform float4 clip_Min_F;
_viv_uniform float4 clip_Max_F;

#define LSTMUNIT_CB_BF16(act_name, act_func) \
__kernel void lstmunit_activation_CB_BF16toBF16_BF16_##act_name( \
    __read_only image2d_array_t input_f_conv, \
    __read_only image2d_array_t input_c_conv, \
    __read_only image2d_array_t input_o_conv, \
    __read_only image2d_t cell_state_in, \
    __read_only image2d_array_t hstate_f_conv, \
    __read_only image2d_array_t hstate_c_conv, \
    __read_only image2d_array_t hstate_o_conv, \
    __read_only image2d_t bias_f, \
    __read_only image2d_t bias_c, \
    __read_only image2d_t bias_o, \
    __write_only image2d_array_t output, \
    __write_only image2d_t cell_state_out, \
    __write_only image2d_t h_state_out, \
    int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \
    ) \
{ \
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
    vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \
    float4 src0, src1, src2, src3, src4; \
    vxc_short8 vect10, vect11, vect12, vect13; \
    float4 src10, src11, src12, src13; \
    float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \
    float4 b0, b1, b2, b3; \
    VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    b1 = read_imagef(bias_f, coord_in.xw); \
    b2 = read_imagef(bias_c, coord_in.xw); \
    b3 = read_imagef(bias_o, coord_in.xw); \
    vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
    VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src1, data0, 16); \
    VXC_DP2x8(data1, vect11, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src11, data1, 16); \
    VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src2, data0, 16); \
    VXC_DP2x8(data1, vect12, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src12, data1, 16); \
    VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src3, data0, 16); \
    VXC_DP2x8(data1, vect13, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src13, data1, 16); \
    VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, data_c_t, data0, 16); \
    data_f_t = src1 + src11 + b1; \
    data_g_t = src2 + src12 + b2; \
    data_o_t = src3 + src13 + b3; \
    \
    data_f_t = act_func(data_f_t + forget_bias); \
    data_g_t = tangentH(data_g_t); \
    data_i_t = 1.0 - data_f_t; \
    data_i_t = data_i_t * data_g_t; \
    data_c_t = data_c_t * data_f_t + data_i_t; \
    data_o_t = act_func(data_o_t); \
    data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \
    data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \
    _viv_asm(COPY, vect0, data_c_t, 16); \
    VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
    VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    data_c_t = tangentH(data_c_t); \
    data_o_t = data_o_t * data_c_t * outputScale + outputZP; \
    _viv_asm(COPY, vect1, data_o_t, 16); \
    VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
    VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_WriteImage(h_state_out, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
LSTMUNIT_CB_BF16(SIGMOID, sigmoid)
LSTMUNIT_CB_BF16(HARD_SIGMOID, hard_sigmoid)
@@ -0,0 +1,101 @@
#include "cl_viv_vx_ext.h"

_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;

_viv_uniform float logE;
_viv_uniform float twoLogE;
_viv_uniform float forget_bias;
float4 sigmoid(float4 x)
{
    x *= -logE;
    x = 1 + exp2(x);
    return 1 / x;
}
float4 hard_sigmoid(float4 x)
{
    x = 0.2 * x + 0.5;
    x = clamp(x, 0, 1);
    return x;
}
float4 tangentH(float4 x)
{
    x *= -twoLogE;
    x = 1 + exp2(x);
    x = 1 / x;
    return 2 * x - 1;
}
_viv_uniform float outputScale;
_viv_uniform float outputZP;
_viv_uniform float4 clip_Min_F;
_viv_uniform float4 clip_Max_F;

_viv_uniform VXC_512Bits uniExtractHalf4_4x4;
#define LSTMUNIT_CLP_BF16(act_name, act_func) \
__kernel void lstmunit_activation_CLP_BF16toBF16_BF16_##act_name( \
    __read_only image2d_array_t input_f_conv, \
    __read_only image2d_array_t input_c_conv, \
    __read_only image2d_array_t input_o_conv, \
    __read_only image2d_t cell_state_in, \
    __read_only image2d_t bias_f, \
    __read_only image2d_t bias_c, \
    __read_only image2d_t bias_o, \
    __read_only image2d_t layer_norm_wf, \
    __read_only image2d_t layer_norm_wc, \
    __read_only image2d_t layer_norm_wo, \
    __write_only image2d_array_t output, \
    __write_only image2d_t cell_state_out, \
    int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \
    ) \
{ \
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
    vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \
    float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \
    float4 w0, w1, w2, b0, b1, b2; \
    VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect4, cell_state_in, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    w0 = read_imagef(layer_norm_wf, coord_in.xw); \
    w1 = read_imagef(layer_norm_wc, coord_in.xw); \
    w2 = read_imagef(layer_norm_wo, coord_in.xw); \
    b0 = read_imagef(bias_f, coord_in.xw); \
    b1 = read_imagef(bias_c, coord_in.xw); \
    b2 = read_imagef(bias_o, coord_in.xw); \
    vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
    VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, data_f_t, data0, 16); \
    VXC_DP2x8(data1, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, data_g_t, data1, 16); \
    VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, data_o_t, data0, 16); \
    VXC_DP2x8(data1, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, data_c_t, data1, 16); \
    \
    data_f_t = data_f_t * w0 + b0; \
    data_g_t = data_g_t * w1 + b1; \
    data_o_t = data_o_t * w2 + b2; \
    \
    data_f_t = act_func(data_f_t + forget_bias); \
    data_g_t = tangentH(data_g_t); \
    data_i_t = 1.0 - data_f_t; \
    data_i_t = data_i_t * data_g_t; \
    data_c_t = data_c_t * data_f_t + data_i_t; \
    data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \
    data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \
    _viv_asm(COPY, vect0, data_c_t, 16); \
    VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
    VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    data_o_t = act_func(data_o_t); \
    data_c_t = tangentH(data_c_t); \
    data_o_t = data_o_t * data_c_t * outputScale + outputZP; \
    _viv_asm(COPY, vect1, data_o_t, 16); \
    VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
    VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
LSTMUNIT_CLP_BF16(SIGMOID, sigmoid)
LSTMUNIT_CLP_BF16(HARD_SIGMOID, hard_sigmoid)
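The *L* variants take no recurrent hstate_*_conv inputs; each gate pre-activation is instead scaled by a per-channel layer-norm weight and offset by the bias, as in data_f_t = data_f_t * w0 + b0. The mean/variance normalization itself is presumably performed by an earlier pass, with this kernel applying only the learned scale-and-shift. (uniExtractHalf4_4x4 is declared in this file but never referenced by the BF16 path shown here.)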
@@ -0,0 +1,102 @@
#include "cl_viv_vx_ext.h"

_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;

_viv_uniform float logE;
_viv_uniform float twoLogE;
_viv_uniform float forget_bias;
float4 sigmoid(float4 x)
{
    x *= -logE;
    x = 1 + exp2(x);
    return 1 / x;
}
float4 hard_sigmoid(float4 x)
{
    x = 0.2 * x + 0.5;
    x = clamp(x, 0, 1);
    return x;
}
float4 tangentH(float4 x)
{
    x *= -twoLogE;
    x = 1 + exp2(x);
    x = 1 / x;
    return 2 * x - 1;
}
_viv_uniform float outputScale;
_viv_uniform float outputZP;
_viv_uniform float4 clip_Min_F;
_viv_uniform float4 clip_Max_F;

#define LSTMUNIT_CL_BF16(act_name, act_func) \
__kernel void lstmunit_activation_CL_BF16toBF16_BF16_##act_name( \
    __read_only image2d_array_t input_f_conv, \
    __read_only image2d_array_t input_c_conv, \
    __read_only image2d_array_t input_o_conv, \
    __read_only image2d_t cell_state_in, \
    __read_only image2d_t bias_f, \
    __read_only image2d_t bias_c, \
    __read_only image2d_t bias_o, \
    __read_only image2d_t layer_norm_wf, \
    __read_only image2d_t layer_norm_wc, \
    __read_only image2d_t layer_norm_wo, \
    __write_only image2d_array_t output, \
    __write_only image2d_t cell_state_out, \
    __write_only image2d_t h_state_out, \
    int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \
    ) \
{ \
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
    vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \
    float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \
    float4 w0, w1, w2, b0, b1, b2; \
    VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect4, cell_state_in, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    w0 = read_imagef(layer_norm_wf, coord_in.xw); \
    w1 = read_imagef(layer_norm_wc, coord_in.xw); \
    w2 = read_imagef(layer_norm_wo, coord_in.xw); \
    b0 = read_imagef(bias_f, coord_in.xw); \
    b1 = read_imagef(bias_c, coord_in.xw); \
    b2 = read_imagef(bias_o, coord_in.xw); \
    vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
    VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, data_f_t, data0, 16); \
    VXC_DP2x8(data1, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, data_g_t, data1, 16); \
    VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, data_o_t, data0, 16); \
    VXC_DP2x8(data1, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, data_c_t, data1, 16); \
    \
    data_f_t = data_f_t * w0 + b0; \
    data_g_t = data_g_t * w1 + b1; \
    data_o_t = data_o_t * w2 + b2; \
    \
    data_f_t = act_func(data_f_t + forget_bias); \
    data_g_t = tangentH(data_g_t); \
    data_i_t = 1.0 - data_f_t; \
    data_i_t = data_i_t * data_g_t; \
    data_c_t = data_c_t * data_f_t + data_i_t; \
    data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \
    data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \
    _viv_asm(COPY, vect0, data_c_t, 16); \
    VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
    VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    data_o_t = act_func(data_o_t); \
    data_c_t = tangentH(data_c_t); \
    data_o_t = data_o_t * data_c_t * outputScale + outputZP; \
    _viv_asm(COPY, vect1, data_o_t, 16); \
    VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
    VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_WriteImage(h_state_out, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
LSTMUNIT_CL_BF16(SIGMOID, sigmoid)
LSTMUNIT_CL_BF16(HARD_SIGMOID, hard_sigmoid)
@@ -0,0 +1,104 @@
#include "cl_viv_vx_ext.h"

_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;

_viv_uniform float logE;
_viv_uniform float twoLogE;
_viv_uniform float forget_bias;
float4 sigmoid(float4 x)
{
    x *= -logE;
    x = 1 + exp2(x);
    return 1 / x;
}
float4 hard_sigmoid(float4 x)
{
    x = 0.2 * x + 0.5;
    x = clamp(x, 0, 1);
    return x;
}
float4 tangentH(float4 x)
{
    x *= -twoLogE;
    x = 1 + exp2(x);
    x = 1 / x;
    return 2 * x - 1;
}
_viv_uniform float outputScale;
_viv_uniform float outputZP;
_viv_uniform float4 clip_Min_F;
_viv_uniform float4 clip_Max_F;

#define LSTMUNIT_CSP_BF16(act_name, act_func) \
__kernel void lstmunit_activation_CSP_BF16toBF16_BF16_##act_name( \
    __read_only image2d_array_t input_f_conv, \
    __read_only image2d_array_t input_c_conv, \
    __read_only image2d_array_t input_o_conv, \
    __read_only image2d_t cell_state_in, \
    __read_only image2d_array_t hstate_f_conv, \
    __read_only image2d_array_t hstate_c_conv, \
    __read_only image2d_array_t hstate_o_conv, \
    __write_only image2d_array_t output, \
    __write_only image2d_t cell_state_out, \
    int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \
    ) \
{ \
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
    vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \
    float4 src0, src1, src2, src3, src4; \
    vxc_short8 vect10, vect11, vect12, vect13; \
    float4 src10, src11, src12, src13; \
    float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \
    VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
    VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src1, data0, 16); \
    VXC_DP2x8(data1, vect11, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src11, data1, 16); \
    VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src2, data0, 16); \
    VXC_DP2x8(data1, vect12, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src12, data1, 16); \
    VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src3, data0, 16); \
    VXC_DP2x8(data1, vect13, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src13, data1, 16); \
    VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, data_c_t, data0, 16); \
    data_f_t = src1 + src11; \
    data_g_t = src2 + src12; \
    data_o_t = src3 + src13; \
    \
    data_f_t = act_func(data_f_t + forget_bias); \
    data_g_t = tangentH(data_g_t); \
    data_i_t = 1.0 - data_f_t; \
    data_i_t = data_i_t * data_g_t; \
    data_c_t = data_c_t * data_f_t + data_i_t; \
    data_o_t = act_func(data_o_t); \
    data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \
    data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \
    _viv_asm(COPY, vect0, data_c_t, 16); \
    VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
    VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    data_c_t = tangentH(data_c_t); \
    data_o_t = data_o_t * data_c_t * outputScale + outputZP; \
    _viv_asm(COPY, vect1, data_o_t, 16); \
    VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
    VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
LSTMUNIT_CSP_BF16(SIGMOID, sigmoid)
LSTMUNIT_CSP_BF16(HARD_SIGMOID, hard_sigmoid)
@@ -0,0 +1,106 @@
#include "cl_viv_vx_ext.h"

_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;

_viv_uniform float logE;
_viv_uniform float twoLogE;
_viv_uniform float forget_bias;
float4 sigmoid(float4 x)
{
    x *= -logE;
    x = 1 + exp2(x);
    return 1 / x;
}
float4 hard_sigmoid(float4 x)
{
    x = 0.2 * x + 0.5;
    x = clamp(x, 0, 1);
    return x;
}
float4 tangentH(float4 x)
{
    x *= -twoLogE;
    x = 1 + exp2(x);
    x = 1 / x;
    return 2 * x - 1;
}
_viv_uniform float outputScale;
_viv_uniform float outputZP;
_viv_uniform float4 clip_Min_F;
_viv_uniform float4 clip_Max_F;

#define LSTMUNIT_CS_BF16(act_name, act_func) \
__kernel void lstmunit_activation_CS_BF16toBF16_BF16_##act_name( \
    __read_only image2d_array_t input_f_conv, \
    __read_only image2d_array_t input_c_conv, \
    __read_only image2d_array_t input_o_conv, \
    __read_only image2d_t cell_state_in, \
    __read_only image2d_array_t hstate_f_conv, \
    __read_only image2d_array_t hstate_c_conv, \
    __read_only image2d_array_t hstate_o_conv, \
    __write_only image2d_array_t output, \
    __write_only image2d_t cell_state_out, \
    __write_only image2d_t h_state_out, \
    int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \
    ) \
{ \
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
    vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \
    float4 src0, src1, src2, src3, src4; \
    vxc_short8 vect10, vect11, vect12, vect13; \
    float4 src10, src11, src12, src13; \
    float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \
    VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
    VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src1, data0, 16); \
    VXC_DP2x8(data1, vect11, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src11, data1, 16); \
    VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src2, data0, 16); \
    VXC_DP2x8(data1, vect12, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src12, data1, 16); \
    VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src3, data0, 16); \
    VXC_DP2x8(data1, vect13, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src13, data1, 16); \
    VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, data_c_t, data0, 16); \
    data_f_t = src1 + src11; \
    data_g_t = src2 + src12; \
    data_o_t = src3 + src13; \
    \
    data_f_t = act_func(data_f_t + forget_bias); \
    data_g_t = tangentH(data_g_t); \
    data_i_t = 1.0 - data_f_t; \
    data_i_t = data_i_t * data_g_t; \
    data_c_t = data_c_t * data_f_t + data_i_t; \
    data_o_t = act_func(data_o_t); \
    data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \
    data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \
    _viv_asm(COPY, vect0, data_c_t, 16); \
    VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
    VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    data_c_t = tangentH(data_c_t); \
    data_o_t = data_o_t * data_c_t * outputScale + outputZP; \
    _viv_asm(COPY, vect1, data_o_t, 16); \
    VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
    VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_WriteImage(h_state_out, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
LSTMUNIT_CS_BF16(SIGMOID, sigmoid)
LSTMUNIT_CS_BF16(HARD_SIGMOID, hard_sigmoid)
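Taken together, the kernel-name suffixes appear to compose as: C = CIFG (no input gate), L = layer norm (per-gate w/b instead of recurrent convolutions), B = per-gate bias images, S = plain sum of input and recurrent convolutions, and a trailing P = projection, in which case the kernel omits h_state_out and leaves the hidden state to a separate projection pass.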
@@ -0,0 +1,110 @@
#include "cl_viv_vx_ext.h"

_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;

_viv_uniform float logE;
_viv_uniform float twoLogE;
_viv_uniform float forget_bias;
float4 sigmoid(float4 x)
{
    x *= -logE;
    x = 1 + exp2(x);
    return 1 / x;
}
float4 hard_sigmoid(float4 x)
{
    x = 0.2 * x + 0.5;
    x = clamp(x, 0, 1);
    return x;
}
float4 tangentH(float4 x)
{
    x *= -twoLogE;
    x = 1 + exp2(x);
    x = 1 / x;
    return 2 * x - 1;
}
_viv_uniform float outputScale;
_viv_uniform float outputZP;
_viv_uniform float4 clip_Min_F;
_viv_uniform float4 clip_Max_F;

#define LSTMUNIT_LP_BF16(act_name, act_func) \
__kernel void lstmunit_activation_LP_BF16toBF16_BF16_##act_name( \
    __read_only image2d_array_t input_i_conv, \
    __read_only image2d_array_t input_f_conv, \
    __read_only image2d_array_t input_c_conv, \
    __read_only image2d_array_t input_o_conv, \
    __read_only image2d_t cell_state_in, \
    __read_only image2d_t bias_i, \
    __read_only image2d_t bias_f, \
    __read_only image2d_t bias_c, \
    __read_only image2d_t bias_o, \
    __read_only image2d_t layer_norm_wi, \
    __read_only image2d_t layer_norm_wf, \
    __read_only image2d_t layer_norm_wc, \
    __read_only image2d_t layer_norm_wo, \
    __write_only image2d_array_t output, \
    __write_only image2d_t cell_state_out, \
    int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \
    ) \
{ \
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
    vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \
    float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \
    float4 w0, w1, w2, w3, b0, b1, b2, b3; \
    VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect4, cell_state_in, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    w0 = read_imagef(layer_norm_wi, coord_in.xw); \
    w1 = read_imagef(layer_norm_wf, coord_in.xw); \
    w2 = read_imagef(layer_norm_wc, coord_in.xw); \
    w3 = read_imagef(layer_norm_wo, coord_in.xw); \
    b0 = read_imagef(bias_i, coord_in.xw); \
    b1 = read_imagef(bias_f, coord_in.xw); \
    b2 = read_imagef(bias_c, coord_in.xw); \
    b3 = read_imagef(bias_o, coord_in.xw); \
    vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
    VXC_DP2x8(data0, vect0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, data_i_t, data0, 16); \
    VXC_DP2x8(data1, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, data_f_t, data1, 16); \
    VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, data_g_t, data0, 16); \
    VXC_DP2x8(data1, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, data_o_t, data1, 16); \
    VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, data_c_t, data0, 16); \
    \
    data_i_t = data_i_t * w0 + b0; \
    data_f_t = data_f_t * w1 + b1; \
    data_g_t = data_g_t * w2 + b2; \
    data_o_t = data_o_t * w3 + b3; \
    \
    data_i_t = act_func(data_i_t); \
    data_f_t = act_func(data_f_t + forget_bias); \
    data_g_t = tangentH(data_g_t); \
    data_i_t = data_i_t * data_g_t; \
    data_c_t = data_c_t * data_f_t + data_i_t; \
    data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \
    data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \
    _viv_asm(COPY, vect0, data_c_t, 16); \
    VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
    VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    data_o_t = act_func(data_o_t); \
    data_c_t = tangentH(data_c_t); \
    data_o_t = data_o_t * data_c_t * outputScale + outputZP; \
    _viv_asm(COPY, vect1, data_o_t, 16); \
    VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
    VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
LSTMUNIT_LP_BF16(SIGMOID, sigmoid)
LSTMUNIT_LP_BF16(HARD_SIGMOID, hard_sigmoid)
@@ -0,0 +1,112 @@
#include "cl_viv_vx_ext.h"

_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;

_viv_uniform float logE;
_viv_uniform float twoLogE;
_viv_uniform float forget_bias;
float4 sigmoid(float4 x)
{
    x *= -logE;
    x = 1 + exp2(x);
    return 1 / x;
}
float4 hard_sigmoid(float4 x)
{
    x = 0.2 * x + 0.5;
    x = clamp(x, 0, 1);
    return x;
}
float4 tangentH(float4 x)
{
    x *= -twoLogE;
    x = 1 + exp2(x);
    x = 1 / x;
    return 2 * x - 1;
}
_viv_uniform float outputScale;
_viv_uniform float outputZP;
_viv_uniform float4 clip_Min_F;
_viv_uniform float4 clip_Max_F;

#define LSTMUNIT_L_BF16(act_name, act_func) \
__kernel void lstmunit_activation_L_BF16toBF16_BF16_##act_name( \
    __read_only image2d_array_t input_i_conv, \
    __read_only image2d_array_t input_f_conv, \
    __read_only image2d_array_t input_c_conv, \
    __read_only image2d_array_t input_o_conv, \
    __read_only image2d_t cell_state_in, \
    __read_only image2d_t bias_i, \
    __read_only image2d_t bias_f, \
    __read_only image2d_t bias_c, \
    __read_only image2d_t bias_o, \
    __read_only image2d_t layer_norm_wi, \
    __read_only image2d_t layer_norm_wf, \
    __read_only image2d_t layer_norm_wc, \
    __read_only image2d_t layer_norm_wo, \
    __write_only image2d_array_t output, \
    __write_only image2d_t cell_state_out, \
    __write_only image2d_t h_state_out, \
    int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \
    ) \
{ \
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
    vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \
    float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \
    float4 w0, w1, w2, w3, b0, b1, b2, b3; \
    VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect4, cell_state_in, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    w0 = read_imagef(layer_norm_wi, coord_in.xw); \
    w1 = read_imagef(layer_norm_wf, coord_in.xw); \
    w2 = read_imagef(layer_norm_wc, coord_in.xw); \
    w3 = read_imagef(layer_norm_wo, coord_in.xw); \
    b0 = read_imagef(bias_i, coord_in.xw); \
    b1 = read_imagef(bias_f, coord_in.xw); \
    b2 = read_imagef(bias_c, coord_in.xw); \
    b3 = read_imagef(bias_o, coord_in.xw); \
    vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
    VXC_DP2x8(data0, vect0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, data_i_t, data0, 16); \
    VXC_DP2x8(data1, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, data_f_t, data1, 16); \
    VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, data_g_t, data0, 16); \
    VXC_DP2x8(data1, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, data_o_t, data1, 16); \
    VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, data_c_t, data0, 16); \
    \
    data_i_t = data_i_t * w0 + b0; \
    data_f_t = data_f_t * w1 + b1; \
    data_g_t = data_g_t * w2 + b2; \
    data_o_t = data_o_t * w3 + b3; \
    \
    data_i_t = act_func(data_i_t); \
    data_f_t = act_func(data_f_t + forget_bias); \
    data_g_t = tangentH(data_g_t); \
    data_i_t = data_i_t * data_g_t; \
    data_c_t = data_c_t * data_f_t + data_i_t; \
    data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \
    data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \
    _viv_asm(COPY, vect0, data_c_t, 16); \
    VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
    VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    data_o_t = act_func(data_o_t); \
    data_c_t = tangentH(data_c_t); \
    data_o_t = data_o_t * data_c_t * outputScale + outputZP; \
    _viv_asm(COPY, vect1, data_o_t, 16); \
    VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
    VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_WriteImage(h_state_out, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
LSTMUNIT_L_BF16(SIGMOID, sigmoid)
LSTMUNIT_L_BF16(HARD_SIGMOID, hard_sigmoid)
@@ -0,0 +1,117 @@
#include "cl_viv_vx_ext.h"

_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;

_viv_uniform float logE;
_viv_uniform float twoLogE;
_viv_uniform float forget_bias;
float4 sigmoid(float4 x)
{
    x *= -logE;
    x = 1 + exp2(x);
    return 1 / x;
}
float4 hard_sigmoid(float4 x)
{
    x = 0.2 * x + 0.5;
    x = clamp(x, 0, 1);
    return x;
}
float4 tangentH(float4 x)
{
    x *= -twoLogE;
    x = 1 + exp2(x);
    x = 1 / x;
    return 2 * x - 1;
}
_viv_uniform float outputScale;
_viv_uniform float outputZP;
_viv_uniform float4 clip_Min_F;
_viv_uniform float4 clip_Max_F;

#define LSTMUNIT_SP_BF16(act_name, act_func) \
__kernel void lstmunit_activation_SP_BF16toBF16_BF16_##act_name( \
    __read_only image2d_array_t input_i_conv, \
    __read_only image2d_array_t input_f_conv, \
    __read_only image2d_array_t input_c_conv, \
    __read_only image2d_array_t input_o_conv, \
    __read_only image2d_t cell_state_in, \
    __read_only image2d_array_t hstate_i_conv, \
    __read_only image2d_array_t hstate_f_conv, \
    __read_only image2d_array_t hstate_c_conv, \
    __read_only image2d_array_t hstate_o_conv, \
    __write_only image2d_array_t output, \
    __write_only image2d_t cell_state_out, \
    int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \
    ) \
{ \
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
    vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1, data2, data3; \
    vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
    vxc_float4 src0, src1, src2, src3; \
    vxc_short8 vect10, vect11, vect12, vect13; \
    vxc_float4 src10, src11, src12, src13; \
    float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \
    VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    VXC_DP2x8(data0, vect0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src0, data0, 16); \
    VXC_DP2x8(data1, vect10, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src10, data1, 16); \
    VXC_DP2x8(data2, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src1, data2, 16); \
    VXC_DP2x8(data3, vect11, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src11, data3, 16); \
    VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src2, data0, 16); \
    VXC_DP2x8(data1, vect12, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src12, data1, 16); \
    VXC_DP2x8(data2, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src3, data2, 16); \
    VXC_DP2x8(data3, vect13, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, src13, data3, 16); \
    VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniConvBF16toF32_Part0_2x8); \
    _viv_asm(COPY, data_c_t, data0, 16); \
    data_i_t = src0 + src10; \
    data_f_t = src1 + src11; \
    data_g_t = src2 + src12; \
    data_o_t = src3 + src13; \
    \
    data_i_t = act_func(data_i_t); \
    data_f_t = act_func(data_f_t + forget_bias); \
    data_g_t = tangentH(data_g_t); \
    data_i_t = data_i_t * data_g_t; \
    data_c_t = data_c_t * data_f_t + data_i_t; \
    data_o_t = act_func(data_o_t); \
    data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \
    data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \
    _viv_asm(COPY, data0, data_c_t, 16); \
    VXC_DP2x8(data1, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniExtractOddData_2x8); \
    VXC_WriteImage(cell_state_out, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
    data_c_t = tangentH(data_c_t); \
    data_o_t = data_o_t * data_c_t * outputScale + outputZP; \
    _viv_asm(COPY, data0, data_o_t, 16); \
    VXC_DP2x8(data1, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
        uniExtractOddData_2x8); \
    VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
LSTMUNIT_SP_BF16(SIGMOID, sigmoid)
LSTMUNIT_SP_BF16(HARD_SIGMOID, hard_sigmoid)
|
||||
|
|
@ -0,0 +1,118 @@
#include "cl_viv_vx_ext.h"

_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;

_viv_uniform float logE;
_viv_uniform float twoLogE;
_viv_uniform float forget_bias;
float4 sigmoid(float4 x)
{
    x *= -logE;
    x = 1 + exp2(x);
    return 1 / x;
}
float4 hard_sigmoid(float4 x)
{
    x = 0.2 * x + 0.5;
    x = clamp(x, 0, 1);
    return x;
}
float4 tangentH(float4 x)
{
    x *= -twoLogE;
    x = 1 + exp2(x);
    x = 1 / x;
    return 2 * x - 1;
}
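These helpers route both activations through exp2, which the shader hardware evaluates natively; the uniforms logE and twoLogE are presumably preloaded on the host with log2(e) and 2*log2(e). As a sketch, the identities being used are

    sigmoid(x) = 1 / (1 + e^(-x)) = 1 / (1 + 2^(-x * log2(e)))
    tanh(x)    = 2 / (1 + 2^(-2x * log2(e))) - 1

and hard_sigmoid is the usual piecewise-linear approximation clamp(0.2 * x + 0.5, 0, 1).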
_viv_uniform float outputScale;
_viv_uniform float outputZP;
_viv_uniform float4 clip_Min_F;
_viv_uniform float4 clip_Max_F;

#define LSTMUNIT_S_BF16(act_name, act_func) \
__kernel void lstmunit_activation_S_BF16toBF16_BF16_##act_name( \
__read_only image2d_array_t input_i_conv, \
__read_only image2d_array_t input_f_conv, \
__read_only image2d_array_t input_c_conv, \
__read_only image2d_array_t input_o_conv, \
__read_only image2d_t cell_state_in, \
__read_only image2d_array_t hstate_i_conv, \
__read_only image2d_array_t hstate_f_conv, \
__read_only image2d_array_t hstate_c_conv, \
__read_only image2d_array_t hstate_o_conv, \
__write_only image2d_array_t output, \
__write_only image2d_t cell_state_out, \
__write_only image2d_t h_state_out, \
int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \
) \
{ \
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \
vxc_float4 src0, src1, src2, src3, src4; \
vxc_short8 vect10, vect11, vect12, vect13; \
vxc_float4 src10, src11, src12, src13; \
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \
VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_DP2x8(data0, vect0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src0, data0, 16); \
VXC_DP2x8(data1, vect10, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src10, data1, 16); \
VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src1, data0, 16); \
VXC_DP2x8(data1, vect11, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src11, data1, 16); \
VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src2, data0, 16); \
VXC_DP2x8(data1, vect12, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src12, data1, 16); \
VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src3, data0, 16); \
VXC_DP2x8(data1, vect13, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, src13, data1, 16); \
VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, data_c_t, data0, 16); \
\
data_i_t = src0 + src10; \
data_f_t = src1 + src11; \
data_g_t = src2 + src12; \
data_o_t = src3 + src13; \
\
data_i_t = act_func(data_i_t); \
data_f_t = act_func(data_f_t + forget_bias); \
data_g_t = tangentH(data_g_t); \
data_i_t = data_i_t * data_g_t; \
data_c_t = data_c_t * data_f_t + data_i_t; \
data_o_t = act_func(data_o_t); \
data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \
data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \
_viv_asm(COPY, vect0, data_c_t, 16); \
VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
data_c_t = tangentH(data_c_t); \
data_o_t = data_o_t * data_c_t * outputScale + outputZP; \
_viv_asm(COPY, vect1, data_o_t, 16); \
VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_WriteImage(h_state_out, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
LSTMUNIT_S_BF16(SIGMOID, sigmoid)
LSTMUNIT_S_BF16(HARD_SIGMOID, hard_sigmoid)
@ -0,0 +1,283 @@
#include "cl_viv_vx_ext.h"

_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
_viv_uniform VXC_512Bits uniConvF16toFp32_4x4;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;

_viv_uniform float inout_scale;
_viv_uniform float inout_tail;

_viv_uniform int width;
_viv_uniform int height;

#define MAXPOOL_QINT(in_name, out_name, src_type, dst_type, max_val) \
__kernel void maxpool_##in_name##to##out_name( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int stride_x, int stride_y, int pad_x, int pad_y, \
int kernel_dia_x, int kernel_dia_y, int dilation_x, int dilation_y) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
int gidz = get_global_id(2); \
int4 coord_out = (int4)(gidx, gidy, gidz, 0); \
int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y); \
int4 coord_in = coord_out; \
int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y); \
for(; pos_start.x < 0;) \
{ \
pos_start.x += dilation_x; \
} \
for(; pos_start.y < 0;) \
{ \
pos_start.y += dilation_y; \
} \
pos_end = min(pos_end, (int2)(width, height)); \
\
src_type src0; \
dst_type maxVal; \
maxVal.x = max_val; \
\
int8 input_desc; \
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr_a); \
\
for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y) \
{ \
for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;) \
{ \
VXC_OP4(img_load_3d, src0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
coord_in.x += dilation_x; \
VXC_VertMax3_Integer(maxVal, src0, src0, maxVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
} \
} \
\
float4 fValTmp; \
fValTmp.x = maxVal.x * inout_scale + inout_tail; \
int4 i4Val = convert_int4_rte(fValTmp); \
VXC_DP2x8(maxVal, i4Val, i4Val, VXC_MODIFIER(0, 0, 0, VXC_RM_ToNearestEven, 1), \
uniConvertInt32toUint8_2x8); \
VXC_WriteImage2DArray(output, coord_out, maxVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
}
MAXPOOL_QINT(U8, U8, vxc_uchar8, vxc_uchar8, 0)
MAXPOOL_QINT(I8, I8, vxc_char8, vxc_char8, -128)
MAXPOOL_QINT(I16, I16, vxc_short8, vxc_short8, -32768)
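The window walk in these kernels is easier to see in scalar form. A minimal C sketch of one axis, under the assumption (not visible in this diff) that kernel_dia already holds the dilated kernel extent dilation * (ksize - 1) + 1; the function name and signature are illustrative only:

#include <limits.h>

/* Hypothetical scalar model of the dilated max-pool window above. */
static int maxpool_1d(const int *row, int width, int out_x,
                      int stride, int pad, int kernel_dia, int dilation)
{
    int start = out_x * stride - pad;   /* pos_start */
    int end   = start + kernel_dia;     /* pos_end   */
    while (start < 0)
        start += dilation;              /* skip taps that fall in the left pad */
    if (end > width)
        end = width;                    /* clip the window to the image border */
    int best = INT_MIN;                 /* plays the role of max_val           */
    for (int x = start; x < end; x += dilation)
        best = row[x] > best ? row[x] : best;
    return best;
}

Stepping the loop index by dilation rather than 1 is what makes the same code serve both the dense (dilation == 1) and dilated cases.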

__kernel void maxpool_F16toF16(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int stride_x, int stride_y, int pad_x, int pad_y,
int kernel_dia_x, int kernel_dia_y, int dilation_x, int dilation_y)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int gidz = get_global_id(2);
int4 coord_out = (int4)(gidx, gidy, gidz, 0);
int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y);
int4 coord_in = coord_out;
int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y);
for(; pos_start.x < 0;)
{
pos_start.x += dilation_x;
}
for(; pos_start.y < 0;)
{
pos_start.y += dilation_y;
}
pos_end = min(pos_end, (int2)(width, height));

vxc_short8 data0;
vxc_half8 maxVal, src0;

int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr_a);
coord_in.xy = pos_start;

VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, maxVal, data0, 16);

for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y)
{
for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;)
{
VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x += dilation_x;
_viv_asm(COPY, src0, data0, 16);
VXC_VertMax3_Half(maxVal, src0, src0, maxVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
}
_viv_asm(COPY, data0, maxVal, 16);
VXC_WriteImage2DArray(output, coord_out, data0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}

#define MAXPOOL_F16_TO_QINT(out_name, dst_type) \
__kernel void maxpool_F16to##out_name( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int stride_x, int stride_y, int pad_x, int pad_y, \
int kernel_dia_x, int kernel_dia_y, int dilation_x, int dilation_y) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
int gidz = get_global_id(2); \
int4 coord_out = (int4)(gidx, gidy, gidz, 0); \
int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y); \
int4 coord_in = coord_out; \
int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y); \
for(; pos_start.x < 0;) \
{ \
pos_start.x += dilation_x; \
} \
for(; pos_start.y < 0;) \
{ \
pos_start.y += dilation_y; \
} \
pos_end = min(pos_end, (int2)(width, height)); \
\
vxc_short8 data0; \
vxc_half8 maxVal, src0; \
\
int8 input_desc; \
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr_a); \
coord_in.xy = pos_start; \
\
VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, maxVal, data0, 16); \
\
for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y) \
{ \
for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;) \
{ \
VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
coord_in.x += dilation_x; \
_viv_asm(COPY, src0, data0, 16); \
VXC_VertMax3_Half(maxVal, src0, src0, maxVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
} \
} \
float4 fValTmp; \
VXC_DP4x4(fValTmp, maxVal, maxVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniConvF16toFp32_4x4); \
fValTmp.x = fValTmp.x * inout_scale + inout_tail; \
int4 i4Val = convert_int4_rte(fValTmp); \
dst_type dst; \
VXC_DP2x8(dst, i4Val, i4Val, VXC_MODIFIER(0, 0, 0, VXC_RM_ToNearestEven, 1), \
uniConvertInt32toUint8_2x8); \
VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
}

MAXPOOL_F16_TO_QINT(U8, vxc_uchar8)
MAXPOOL_F16_TO_QINT(I8, vxc_char8)
MAXPOOL_F16_TO_QINT(I16, vxc_short8)

#define MAXPOOL_QINT_TO_F16(in_name, src_type, max_val) \
__kernel void maxpool_##in_name##toF16( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int stride_x, int stride_y, int pad_x, int pad_y, \
int kernel_dia_x, int kernel_dia_y, int dilation_x, int dilation_y) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
int gidz = get_global_id(2); \
int4 coord_out = (int4)(gidx, gidy, gidz, 0); \
int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y); \
int4 coord_in = coord_out; \
int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y); \
for(; pos_start.x < 0;) \
{ \
pos_start.x += dilation_x; \
} \
for(; pos_start.y < 0;) \
{ \
pos_start.y += dilation_y; \
} \
pos_end = min(pos_end, (int2)(width, height)); \
\
src_type src0, maxVal; \
maxVal.x = max_val; \
\
int8 input_desc; \
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr_a); \
\
for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y) \
{ \
for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;) \
{ \
VXC_OP4(img_load_3d, src0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
coord_in.x += dilation_x; \
VXC_VertMax3_Integer(maxVal, src0, src0, maxVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
} \
} \
\
float4 fValTmp; \
fValTmp.x = maxVal.x * inout_scale + inout_tail; \
half4 h4Val; \
_viv_asm(CONV, h4Val, fValTmp); \
vxc_short8 dst; \
_viv_asm(COPY, dst, h4Val, 4); \
VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
}
MAXPOOL_QINT_TO_F16(U8, vxc_uchar8, 0)
MAXPOOL_QINT_TO_F16(I8, vxc_char8, -128)
MAXPOOL_QINT_TO_F16(I16, vxc_short8, -32768)

__kernel void maxpool_BF16toBF16(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int stride_x, int stride_y, int pad_x, int pad_y,
int kernel_dia_x, int kernel_dia_y, int dilation_x, int dilation_y)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int gidz = get_global_id(2);
int4 coord_out = (int4)(gidx, gidy, gidz, 0);
int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y);
int4 coord_in = coord_out;
int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y);
for(; pos_start.x < 0;)
{
pos_start.x += dilation_x;
}
for(; pos_start.y < 0;)
{
pos_start.y += dilation_y;
}
pos_end = min(pos_end, (int2)(width, height));

vxc_short8 data0, val0;
float4 maxVal, src0;

int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr_a);
coord_in.xy = pos_start;

VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);

VXC_DP2x8(val0, data0, zero, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, maxVal, val0, 4);

for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y)
{
for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;)
{
VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x += dilation_x;
VXC_DP2x8(val0, data0, zero, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, src0, val0, 4);
maxVal = max(src0, maxVal);
}
}
_viv_asm(COPY, data0, maxVal, 16);
VXC_DP2x8(val0, data0, data0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_WriteImage2DArray(output, coord_out, val0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
@ -17,6 +17,8 @@ _viv_uniform VXC_512Bits uniConvertNV12toR_4x4;

_viv_uniform VXC_512Bits uniExtract8Data_2x8;
_viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8;
_viv_uniform VXC_512Bits uniExtractYtoShortSub16_2x8;
_viv_uniform VXC_512Bits uniConvertUchartoFp32_4x4;

#define NV12_COPY_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \
__kernel void pre_process_nv12_copy_##name \
@ -57,14 +59,24 @@ __kernel void pre_process_nv12_copy_##name \
UV.s0123 = UV.s1032; \
} \
\
vxc_short8 tmpY; \
vxc_char16 tmpUV; \
short tmpVal = 128; \
short tmpVal = 16; \
VXC_DP2x8(tmpY, Y, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractYtoShortSub16_2x8); \
tmpVal = 128; \
VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8); \
\
float4 tmpDstB, tmpDstG, tmpDstR; \
VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); \
VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); \
VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); \
vxc_uchar4 DstB_uchar, DstG_uchar, DstR_uchar; \
VXC_DP4x4(DstB_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toB_4x4); \
VXC_DP4x4(DstG_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toG_4x4); \
VXC_DP4x4(DstR_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toR_4x4); \
VXC_DP4x4(tmpDstB, DstB_uchar, DstB_uchar, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \
VXC_DP4x4(tmpDstG, DstG_uchar, DstG_uchar, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \
VXC_DP4x4(tmpDstR, DstR_uchar, DstR_uchar, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \
\
conv_type result; \
dst_type dst0; \
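The new tmpY/tmpUV path subtracts the video-range offsets (16 from luma, 128 from chroma) before the dot-product tables run, which matches the standard limited-range BT.601 conversion. For orientation only, the textbook form of that transform is

    R = 1.164 * (Y - 16) + 1.596 * (V - 128)
    G = 1.164 * (Y - 16) - 0.813 * (V - 128) - 0.391 * (U - 128)
    B = 1.164 * (Y - 16) + 2.018 * (U - 128)

The actual coefficients live in the uniConvertNV12to*_4x4 uniforms, which are configured on the host and are not visible in this diff, so treat the numbers above as the assumed reference, not a quote of the implementation.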
@ -22,9 +22,11 @@ _viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;

_viv_uniform VXC_512Bits uniExtract8Data_2x8;
_viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8;
_viv_uniform VXC_512Bits uniConvertYtoShortSub16_2x8;

_viv_uniform VXC_512Bits uniCalculateYShift_2x8;
_viv_uniform VXC_512Bits uniCalculateUVShift_2x8;
_viv_uniform VXC_512Bits uniConvertUchartoFp32_4x4;

#define NV12_OPT_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \
__kernel void pre_process_nv12_scale_##name##_gq \
@ -85,14 +87,24 @@ __kernel void pre_process_nv12_scale_##name##_gq \
VXC_BitExtract(Y, Y, Y, maskShift, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_BitExtract(UV, UV, UV, maskShiftUv, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
vxc_short8 tmpY; \
vxc_char16 tmpUV; \
short tmpVal = 128; \
short tmpVal = 16; \
VXC_DP2x8(tmpY, Y, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertYtoShortSub16_2x8); \
tmpVal = 128; \
VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); \
\
float4 tmpDstB, tmpDstG, tmpDstR; \
VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); \
VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); \
VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); \
vxc_uchar4 DstB_uchar, DstG_uchar, DstR_uchar; \
VXC_DP4x4(DstB_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toB_4x4); \
VXC_DP4x4(DstG_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toG_4x4); \
VXC_DP4x4(DstR_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toR_4x4); \
VXC_DP4x4(tmpDstB, DstB_uchar, DstB_uchar, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \
VXC_DP4x4(tmpDstG, DstG_uchar, DstG_uchar, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \
VXC_DP4x4(tmpDstR, DstR_uchar, DstR_uchar, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \
\
conv_type result; \
dst_type dst0; \
@ -181,14 +193,24 @@ __kernel void pre_process_nv12_scale_##name \
UV.s01234567 = UV.s10325476; \
} \
\
vxc_short8 tmpY; \
vxc_char16 tmpUV; \
short tmpVal = 128; \
short tmpVal = 16; \
VXC_DP2x8(tmpY, Y, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertYtoShortSub16_2x8); \
tmpVal = 128; \
VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); \
\
float4 tmpDstB, tmpDstG, tmpDstR; \
VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); \
VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); \
VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); \
vxc_uchar4 DstB_uchar, DstG_uchar, DstR_uchar; \
VXC_DP4x4(DstB_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toB_4x4); \
VXC_DP4x4(DstG_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toG_4x4); \
VXC_DP4x4(DstR_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toR_4x4); \
VXC_DP4x4(tmpDstB, DstB_uchar, DstB_uchar, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \
VXC_DP4x4(tmpDstG, DstG_uchar, DstG_uchar, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \
VXC_DP4x4(tmpDstR, DstR_uchar, DstR_uchar, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \
\
conv_type result; \
dst_type dst0; \
@ -118,7 +118,7 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \
VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
\
int4 coord_out = coord; \
int4 coord_out = coord.wwzw; \
coord_out.xyw += rgb_order.xyz; \
float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp, \
rMean * r_scale * output_scale - output_zp, \
@ -16,6 +16,7 @@ _viv_uniform VXC_512Bits uniConvertYUV422toR_4x4;

_viv_uniform VXC_512Bits uniExtract8Data_2x8;
_viv_uniform VXC_512Bits uniExtractYUVtoShortSub_2x8;
_viv_uniform VXC_512Bits uniConvertUchartoFp32_4x4;

#define YUV422_COPY_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \
__kernel void pre_process_yuv422_copy_##name \
@ -54,11 +55,21 @@ __kernel void pre_process_yuv422_copy_##name \
} \
\
float4 tmpDstB, tmpDstG, tmpDstR; \
vxc_uchar4 DstB_uchar, DstG_uchar, DstR_uchar; \
vxc_short2 value = (vxc_short2)(128,16); \
VXC_DP2x8(tmpYUV, YUV, value, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractYUVtoShortSub_2x8); \
VXC_DP4x4(tmpDstB, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toB_4x4); \
VXC_DP4x4(tmpDstG, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toG_4x4); \
VXC_DP4x4(tmpDstR, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toR_4x4); \
VXC_DP4x4(DstB_uchar, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\
uniConvertYUV422toB_4x4); \
VXC_DP4x4(DstG_uchar, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\
uniConvertYUV422toG_4x4); \
VXC_DP4x4(DstR_uchar, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\
uniConvertYUV422toR_4x4); \
VXC_DP4x4(tmpDstB, DstB_uchar, DstB_uchar, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \
VXC_DP4x4(tmpDstG, DstG_uchar, DstG_uchar, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \
VXC_DP4x4(tmpDstR, DstR_uchar, DstR_uchar, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \
\
conv_type result; \
dst_type dst0; \
@ -21,6 +21,7 @@ _viv_uniform VXC_512Bits uniConvertYUV422toR_4x4;
_viv_uniform VXC_512Bits uniExtract8Data_2x8;
_viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8;
_viv_uniform VXC_512Bits uniExtractYtoShortSub16_4x4;
_viv_uniform VXC_512Bits uniConvertUchartoFp32_4x4;

#define uyvy422 1

@ -70,8 +71,8 @@ __kernel void pre_process_yuv422_scale_##name \
} \
\
int4 coord_Y = (int4)(sx.x + y_offset, sy, 0, 0); \
int4 coord_U = (int4)((sx.x >> 1) * 2 + u_offset, sy, 0, 0); \
int4 coord_V = (int4)((sx.x >> 1) * 2 + v_offset, sy, 0, 0); \
int4 coord_U = (int4)((sx.x >> 2) * 4 + u_offset, sy, 0, 0); \
int4 coord_V = (int4)((sx.x >> 2) * 4 + v_offset, sy, 0, 0); \
\
VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
coord_Y.x = sx.y + y_offset; \
@ -81,7 +82,7 @@ __kernel void pre_process_yuv422_scale_##name \
coord_Y.x = sx.w + y_offset; \
VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \
\
sx = (sx >> 1) * 2 + u_offset; \
sx = (sx >> 2) * 4 + u_offset; \
VXC_ReadImage2DArray(UV, input, coord_U, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
coord_U.x = sx.y; \
VXC_ReadImage2DArray(UV, input, coord_U, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \
@ -105,9 +106,19 @@ __kernel void pre_process_yuv422_scale_##name \
VXC_DP2x8(dst_test, dx, dx, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \
\
float4 tmpDstB, tmpDstG, tmpDstR; \
VXC_DP4x4(tmpDstB, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toB_4x4); \
VXC_DP4x4(tmpDstG, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toG_4x4); \
VXC_DP4x4(tmpDstR, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toR_4x4); \
vxc_uchar4 DstB_uchar, DstG_uchar, DstR_uchar; \
VXC_DP4x4(DstB_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\
uniConvertYUV422toB_4x4); \
VXC_DP4x4(DstG_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\
uniConvertYUV422toG_4x4); \
VXC_DP4x4(DstR_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\
uniConvertYUV422toR_4x4); \
VXC_DP4x4(tmpDstB, DstB_uchar, DstB_uchar, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \
VXC_DP4x4(tmpDstG, DstG_uchar, DstG_uchar, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \
VXC_DP4x4(tmpDstR, DstR_uchar, DstR_uchar, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \
\
conv_type result; \
dst_type dst0; \
@ -21,8 +21,8 @@ __kernel void func_name##_##src0_type_name##src1_type_name##toBOOL8_2D( \
int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
src0_type src0; \
src0_copy_type srcA; \
src0_type src1; \
src0_copy_type srcB; \
src1_type src1; \
src1_copy_type srcB; \
VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, srcA, src0, 16); \
VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
@ -21,8 +21,8 @@ __kernel void func_name##_##src0_type_name##src1_type_name##toBOOL8( \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
src0_type src0; \
src0_copy_type srcA; \
src0_type src1; \
src0_copy_type srcB; \
src1_type src1; \
src1_copy_type srcB; \
VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, srcA, src0, 16); \
VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
@ -28,37 +28,40 @@ __kernel void scatter_nd_update_reset_##name0##to##name1( \
Image img1 = create_image_from_image2d(input_ref, size0); \
Image img2 = create_image_from_image2d(temp_ref, size1); \
Image img3 = create_image_from_image2d(temp_buf_int, 4); \
__global ptr0* input_ptr = (__global ptr0*)img1.ptr; \
__global ptr1* output_ptr = (__global ptr1*)img2.ptr; \
__global int* tmp_update_ptr = (__global int*)img3.ptr; \
ptr0 tmpData = input_ptr[gidx]; \
int4 zeros = (int4)(0); \
int loc2 = gidx * 8; \
type0 src; \
type1 tmpDst; \
ptr1 dst; \
vxc_ushort8 ms0; \
_viv_asm(COPY, ms0, multAndoutZP0, 16); \
_viv_asm(COPY, src, tmpData, len0); \
VXC_DP2x8(tmpDst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
uniU8MulAndPostShift_0_Lo_2x8); \
_viv_asm(COPY, dst, tmpDst, len1); \
output_ptr[gidx] = dst; \
vstore4(zeros, 0, tmp_update_ptr + loc2); \
vstore4(zeros, 1, tmp_update_ptr + loc2); \
if(gidx < res) \
if(length > 0) \
{ \
__global ptr2* input_ptr1 = (__global ptr2*)img1.ptr; \
__global ptr3* output_ptr1 = (__global ptr3*)img2.ptr; \
ptr2 tmpData1 = input_ptr1[length + gidx]; \
__global ptr0* input_ptr = (__global ptr0*)img1.ptr; \
__global ptr1* output_ptr = (__global ptr1*)img2.ptr; \
ptr0 tmpData = input_ptr[gidx]; \
int4 zeros = (int4)(0); \
int loc2 = gidx * 8; \
ptr1 dst; \
_viv_asm(COPY, src, tmpData, len0); \
VXC_DP2x8(tmpDst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
uniU8MulAndPostShift_0_Lo_2x8); \
_viv_asm(COPY, dst, tmpDst, len1); \
output_ptr[gidx] = dst; \
vstore4(zeros, 0, tmp_update_ptr + loc2); \
vstore4(zeros, 1, tmp_update_ptr + loc2); \
} \
__global ptr2* input_ptr1 = (__global ptr2*)img1.ptr; \
__global ptr3* output_ptr1 = (__global ptr3*)img2.ptr; \
for(int i = gidx; i < res; i += get_global_size(0)) \
{ \
ptr2 tmpData1 = input_ptr1[length + i]; \
ptr3 dst1; \
dst1 ^= dst1; \
tmp_update_ptr[length + gidx] = 0; \
tmp_update_ptr[length + i] = 0; \
_viv_asm(COPY, src, tmpData1, 4); \
VXC_DP2x8(tmpDst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
uniU8MulAndPostShift_0_Lo_2x8); \
_viv_asm(COPY, dst1, tmpDst, len3); \
output_ptr1[length + gidx] = dst1; \
output_ptr1[length + i] = dst1; \
} \
}
SCATTER_RESET(U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, 8, 8, 1, 1, uchar, uchar, 1)
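The change in this hunk swaps a single guarded tail element per work-item (if(gidx < res)) for a grid-stride loop, so the res leftover elements are all covered even when res exceeds the number of launched work-items. A minimal host-side C model of the pattern, with illustrative names only:

/* Hypothetical scalar model: each of `nthreads` virtual work-items covers
 * indices gidx, gidx + nthreads, gidx + 2*nthreads, ... below `res`,
 * mirroring the for(i = gidx; i < res; i += get_global_size(0)) loop. */
static void reset_tail(int *dst, const int *src, int length, int res,
                       int gidx, int nthreads)
{
    for (int i = gidx; i < res; i += nthreads)
        dst[length + i] = src[length + i];  /* copy one leftover element */
}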
@ -246,14 +249,17 @@ __kernel void scatter_nd_update_copy_##src0_type( \
int gidx = get_global_id(0); \
Image img1 = create_image_from_image2d(temp_ref, element_size); \
Image img2 = create_image_from_image2d(output, element_size); \
__global ptr_type* input_ptr = (__global ptr_type*)img1.ptr; \
__global ptr_type* output_ptr = (__global ptr_type*)img2.ptr; \
output_ptr[gidx] = input_ptr[gidx]; \
if(gidx < res) \
if(length > 0) \
{ \
__global ptr_type1* input_ptr1 = (__global ptr_type1*)img1.ptr; \
__global ptr_type1* output_ptr1 = (__global ptr_type1*)img2.ptr; \
output_ptr1[length + gidx] = input_ptr1[length + gidx]; \
__global ptr_type* input_ptr = (__global ptr_type*)img1.ptr; \
__global ptr_type* output_ptr = (__global ptr_type*)img2.ptr; \
output_ptr[gidx] = input_ptr[gidx]; \
} \
__global ptr_type1* input_ptr1 = (__global ptr_type1*)img1.ptr; \
__global ptr_type1* output_ptr1 = (__global ptr_type1*)img2.ptr; \
for(int i = gidx; i < res; i += get_global_size(0)) \
{ \
output_ptr1[length + i] = input_ptr1[length + i]; \
} \
}
SCATTER_ND_UPDATE_COPY(U8, vxc_uchar8, 1, uchar)
File diff suppressed because it is too large
@ -1,261 +0,0 @@
# to make ovxlib compile for both the IDE and the SDK
# if you want to compile with the IDE: export USE_IDE_LIB=1
# and VIVANTE_SDK_DIR=..../VeriSilicon/VivanteIDE5.4.0/cmdtools/vsimulator

###################################################################################
#common parts
# OBJECTS.

OBJECTS = $(OBJ_DIR)/vsi_nn_context.o \
$(OBJ_DIR)/vsi_nn_client_op.o \
$(OBJ_DIR)/vsi_nn_graph.o \
$(OBJ_DIR)/vsi_nn_node_attr_template.o \
$(OBJ_DIR)/vsi_nn_node.o \
$(OBJ_DIR)/vsi_nn_ops.o \
$(OBJ_DIR)/vsi_nn_daemon.o \
$(OBJ_DIR)/vsi_nn_tensor.o \
$(OBJ_DIR)/vsi_nn_version.o \
$(OBJ_DIR)/vsi_nn_rnn.o \
$(OBJ_DIR)/vsi_nn_rnn_helper.o \
$(OBJ_DIR)/vsi_nn_internal_node.o \
$(OBJ_DIR)/vsi_nn_log.o \
$(OBJ_DIR)/vsi_nn_graph_optimization.o \
$(OBJ_DIR)/vsi_nn_pre_post_process.o

vpath %.c utils
OBJECTS += $(OBJ_DIR)/vsi_nn_code_generator.o \
$(OBJ_DIR)/vsi_nn_binary_tree.o \
$(OBJ_DIR)/vsi_nn_map.o \
$(OBJ_DIR)/vsi_nn_link_list.o \
$(OBJ_DIR)/vsi_nn_math.o \
$(OBJ_DIR)/vsi_nn_dtype_util.o \
$(OBJ_DIR)/vsi_nn_shape_util.o \
$(OBJ_DIR)/vsi_nn_dtype.o \
$(OBJ_DIR)/vsi_nn_limits.o \
$(OBJ_DIR)/vsi_nn_util.o \
$(OBJ_DIR)/vsi_nn_dlfcn.o \
$(OBJ_DIR)/vsi_nn_constraint_check.o \
$(OBJ_DIR)/vsi_nn_hashmap.o \
$(OBJ_DIR)/vsi_nn_tensor_op.o

vpath %.c quantization
OBJECTS += $(OBJ_DIR)/vsi_nn_dynamic_fixed_point.o \
$(OBJ_DIR)/vsi_nn_asymmetric_affine.o \
$(OBJ_DIR)/vsi_nn_perchannel_symmetric_affine.o

vpath %.c post
OBJECTS += $(OBJ_DIR)/vsi_nn_post_fasterrcnn.o \
$(OBJ_DIR)/vsi_nn_post_cmupose.o

vpath %.c libnnext
OBJECTS += $(OBJ_DIR)/vsi_nn_libnnext_resource.o \
$(OBJ_DIR)/vsi_nn_vxkernel.o

vpath %.c cpu_backend
SRCS += ${notdir ${wildcard cpu_backend/*.c}}

vpath %.c libnnext/ops/kernel
SRCS += ${notdir ${wildcard libnnext/ops/kernel/*.c}}

vpath %.c ops
SRCS += ${notdir ${wildcard ops/*.c}}

vpath %.c kernel
SRCS += ${notdir ${wildcard kernel/*.c}}

vpath %.c kernel/cl
SRCS += ${notdir ${wildcard kernel/cl/*.c}}

vpath %.c kernel/cpu
SRCS += ${notdir ${wildcard kernel/cpu/*.c}}

vpath %.c kernel/evis
SRCS += ${notdir ${wildcard kernel/evis/*.c}}

vpath %.c kernel/vx
SRCS += ${notdir ${wildcard kernel/vx/*.c}}

vpath %.c kernel/sp
SRCS += ${notdir ${wildcard kernel/sp/*.c}}

vpath %.c custom/ops
SRCS += ${notdir ${wildcard custom/ops/*.c}}

vpath %.c custom/ops/kernel/evis
SRCS += ${notdir ${wildcard custom/ops/kernel/evis/*.c}}

vpath %.c custom/ops/kernel/cl
SRCS += ${notdir ${wildcard custom/ops/kernel/cl/*.c}}

vpath %.c custom/ops/kernel/cpu
SRCS += ${notdir ${wildcard custom/ops/kernel/cpu/*.c}}

vpath %.c custom/ops/kernel/sp
SRCS += ${notdir ${wildcard custom/ops/kernel/sp/*.c}}

OBJECTS += ${patsubst %.c, $(OBJ_DIR)/%.o, $(SRCS)}

ifeq ($(USE_VIP_DEVICE),1)
vpath %.cpp vip
OBJECTS += $(OBJ_DIR)/virtual_device.o
endif

################################################################################
ifeq ($(USE_IDE_LIB),1)
# IDE.

CC=$(CROSS_COMPILE)gcc

INCLUDES=-I. -I$(VIVANTE_SDK_DIR)/include/ \
-I$(VIVANTE_SDK_DIR)/include/CL \
-I$(VIVANTE_SDK_DIR)/include/VX \
-I../include/ops -I../include/utils -I../include/inference \
-I../include/client -I../include -I../include/libnnext \
-I../include/cpu_backend \
-I../src

ifeq (1,$(DEBUG))
CFLAGS+=-g
LFLAGS+=-g
else
CFLAGS+=-O3
LFLAGS+=-O3
endif
CFLAGS += $(INCLUDES)
CFLAGS += -Wall -Wextra -Wno-unused-parameter -Wno-sign-compare -Werror -Wno-strict-aliasing -Wno-maybe-uninitialized
CFLAGS += -fvisibility=hidden -D'OVXLIB_API=__attribute__((visibility("default")))'

LIBS+= -L$(VIVANTE_SDK_DIR)/lib \
-lOpenVX -lOpenVXU -lCLC -lVSC -lGAL -lEmulator -lvdtproxy -lArchModelSw -lNNArchPerf
LIBS+= -L$(VIVANTE_SDK_DIR)/lib/vsim \
-lOpenVX -lOpenVXU -lCLC -lVSC -lGAL -lEmulator -lvdtproxy
LIBS+= -L$(VIVANTE_SDK_DIR)/lib/x64_linux \
-lOpenVX -lOpenVXU -lCLC -lVSC -lGAL -lEmulator -lvdtproxy
LIBS+= -L$(VIVANTE_SDK_DIR)/lib/x64_linux/vsim \
-lOpenVX -lOpenVXU -lCLC -lVSC -lGAL -lEmulator -lvdtproxy
LIBS+= -L$(VIVANTE_SDK_DIR)/../common/lib/ \
-lvdtproxy
LIBS += -lm -ldl

File = $(VIVANTE_SDK_DIR)/lib/libjpeg.a
File2 = $(VIVANTE_SDK_DIR)/lib/x64_linux/libjpeg.a
File3 = $(VIVANTE_SDK_DIR)/../common/lib/libjpeg.a
ifeq ($(File),$(wildcard $(File)))
LIBS+= $(File)
else ifeq ($(File2),$(wildcard $(File2)))
LIBS+= $(File2)
else
LIBS+= $(File3)
endif

###################################################################################
# Macros.
CFLAGS += -fPIC
DYNAMIC := 1
TARGET_NAME = libovxlib.so
OBJ_DIR=bin_r
TARGET_OUTPUT = $(OBJ_DIR)/$(TARGET_NAME)

all: $(TARGET_OUTPUT)
clean:
@rm -rf $(OBJ_DIR)/* $(OBJ_DIR)

install: $(TARGET_OUTPUT)

################################################################################

LDFLAGS += -Wall -shared -Wl,-soname,$(TARGET_NAME) -Wl,-z,defs -fPIC

ifeq ($(USE_VIP_DEVICE),1)
LDFLAGS += -pthread
LIBS += -lstdc++
INCLUDE += -I../include/vip
$(OBJ_DIR)/virtual_device.o: virtual_device.cpp
@echo " COMPILE $(abspath $<)"
@mkdir -p $(OBJ_DIR)
@$(CXX) -c -std=c++14 -pthread $(CFLAGS) -o $@ $<
endif

$(TARGET_OUTPUT): $(OBJECTS)
@echo " LINK \033[1m$(notdir $@)\033[0m"
@$(CC) $(LDFLAGS) $(OBJECTS) -o $(TARGET_OUTPUT) $(LIBS)

$(OBJ_DIR)/%.o: %.c
@echo " COMPILE $(abspath $<)"
@mkdir -p $(OBJ_DIR)
@$(CC) -c $(CFLAGS) -o $@ $<

else
##################################################################################
#SDK.

# include common definition.
include $(AQROOT)/makefile.linux.def

#################################################################################
INCLUDE += -I$(VIVANTE_SDK_INC) -I$(VIVANTE_SDK_INC)/HAL -I$(AQROOT)/sdk/inc
INCLUDE += -I../include/ops -I../include/utils -I../include/inference
INCLUDE += -I../include/client -I../include -I../include/libnnext
INCLUDE += -I../include/cpu_backend
INCLUDE += -I../src

CFLAGS += $(INCLUDE)
CFLAGS += -Wall -Wextra -Wno-unused-parameter -Wno-sign-compare -Werror
CFLAGS += -fvisibility=hidden -D'OVXLIB_API=__attribute__((visibility("default")))'

################################################################################
# Supply necessary libraries.
ifeq ($(USE_VXC_BINARY)$(USE_VSC_LITE),11)
LIBS += -L$(VIVANTE_SDK_LIB) -l OpenVX -l OpenVXU -l CLC -l VSC_Lite -lGAL
else
LIBS += -L$(VIVANTE_SDK_LIB) -l OpenVX -l OpenVXU -l CLC -l VSC -lGAL
endif
LIBS += -lm -ldl

#############################################################################
# Macros.
ifeq ($(gcdSTATIC_LINK), 1)
STATIC=1
TARGET_NAME = libovxlib.a
else
CFLAGS += -fPIC
DYNAMIC := 1
TARGET_NAME = libovxlib.so
endif

ifneq ("$(OVXLIB_CONFIG)", "")
CFLAGS += -D$(OVXLIB_CONFIG)
endif

ifneq ($(gcdSTATIC_LINK), 1)
ifeq ($(VSI_GPERF_DEBUG), 1)
TCMALLOC_DIR = $(OVXLIB_DIR)/third-party/gperftools
CFLAGS += -I$(TCMALLOC_DIR)/src
CFLAGS += -I$(TCMALLOC_DIR)/src/gperftools
CFLAGS += -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free
CFLAGS += -g
LIBS += -L$(TCMALLOC_DIR)/.libs -ltcmalloc
endif
endif
#############################################################################

# installation directory
INSTALL_DIR := $(VIVANTE_SDK_LIB)

################################################################################
# Include the common makefile.

ifeq ($(USE_VIP_DEVICE),1)
LDFLAGS += -pthread
LIBS += -lstdc++
INCLUDE += -I../include/vip
$(OBJ_DIR)/virtual_device.o: virtual_device.cpp
@echo " COMPILE $(abspath $<)"
@mkdir -p $(OBJ_DIR)
@$(CXX) -c -std=c++14 -pthread $(CFLAGS) -o $@ $<
endif

include $(AQROOT)/common.target
endif
@ -234,6 +234,7 @@ static vsi_bool op_check
IO_TYPE(D_F32, D_I16)
IO_TYPE(D_F16, D_I32)
IO_TYPE(D_I32, D_I32)
IO_TYPE(D_I32, D_I16)
IO_TYPE(D_I8|Q_DFP, D_I32)
IO_TYPE(D_U8|Q_ASYM, D_I32)
IO_TYPE(D_I8|Q_ASYM, D_U8)
@ -299,7 +299,7 @@ static vsi_bool op_setup
}
ret = vsi_nn_op_common_setup(self, inputs, outputs);

if ( _is_dataconvert_op(self, inputs, outputs) )
if ( _is_dataconvert_op(self, inputs, outputs) && ret )
{
vsi_nn_internal_node_t* curr = NULL;
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 1, 1);
@ -128,6 +128,9 @@ static vsi_bool _is_tensorview_support
#ifdef VSI_CONCAT_ENHANCE_SUPPORT
// Driver supports concat optimization in all dimensions.
ret = TRUE;

VSI_UNREFERENCED(self);
VSI_UNREFERENCED(outputs);
#else
/*
If the concat op needs to be optimized to a tensor view, the memory must be contiguous.
@ -24,6 +24,7 @@
#include <string.h>

#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_node.h"
@ -216,8 +217,11 @@ static vsi_bool op_setup
if( VSI_NN_DIM_FMT_NHWC == inputs[1]->attr.dtype.fmt &&
VSI_NN_TYPE_VDATA != inputs[1]->attr.dtype.vx_type )
{
vsi_nn_TransposeTensor( self->graph, inputs[1], perm, 4, NULL );
inputs[1]->attr.dtype.fmt = VSI_NN_DIM_FMT_NCHW;
if (!((vsi_nn_tensor_prv_t*)inputs[1])->processed)
{
vsi_nn_TransposeTensor(self->graph, inputs[1], perm, 4, NULL);
inputs[1]->attr.dtype.fmt = VSI_NN_DIM_FMT_NCHW;
}
}

#ifdef VX_CONVERT_POLICY_WRAP_ENABLE
@ -227,6 +231,8 @@ static vsi_bool op_setup
}
#endif

((vsi_nn_tensor_prv_t*)inputs[1])->processed = TRUE;

nn_param = &self->nn_param.conv2d;

vsi_nn_compute_padding(
@ -248,6 +248,7 @@ static vsi_bool op_check
IO_TYPE(D_BOOL8, D_I16|Q_DFP)
IO_TYPE(D_BOOL8, D_I16|Q_ASYM)
IO_TYPE(D_BOOL8, D_I16|Q_SYM)
IO_TYPE(D_BOOL8, D_F16)
IO_TYPE(D_BOOL8, D_I32)
IO_TYPE(D_BOOL8, D_U16)
IO_TYPE(D_BOOL8, D_U32)
@ -258,6 +259,7 @@ static vsi_bool op_check
IO_TYPE(D_I16|Q_DFP, D_BOOL8)
IO_TYPE(D_I16|Q_ASYM, D_BOOL8)
IO_TYPE(D_I16|Q_SYM, D_BOOL8)
IO_TYPE(D_F16, D_BOOL8)
IO_TYPE(D_I32, D_BOOL8)
IO_TYPE(D_U16, D_BOOL8)
IO_TYPE(D_U32, D_BOOL8)
@ -25,6 +25,7 @@
#include <stdlib.h>

#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_node.h"
@ -410,8 +411,11 @@ static vsi_bool op_setup
* */
if( VSI_NN_DIM_FMT_NHWC == inputs[1]->attr.dtype.fmt )
{
vsi_nn_TransposeTensor( self->graph, inputs[1], perm, 4, NULL );
inputs[1]->attr.dtype.fmt = VSI_NN_DIM_FMT_NCHW;
if (!((vsi_nn_tensor_prv_t*)inputs[1])->processed)
{
vsi_nn_TransposeTensor(self->graph, inputs[1], perm, 4, NULL);
inputs[1]->attr.dtype.fmt = VSI_NN_DIM_FMT_NCHW;
}
}

#ifdef VX_CONVERT_POLICY_WRAP_ENABLE
@ -424,22 +428,30 @@ static vsi_bool op_setup
#ifdef VX_DECONVOLUTION_WEIGHT_LAYOUT_COMPATIBLE_KHRONOS
if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 && TRUE == inputs[1]->attr.is_const)
{
/* whnc->whcn */
vsi_nn_PermuteTensor( self->graph, inputs[1], perm1, 4 );
if (!((vsi_nn_tensor_prv_t*)inputs[1])->processed) {
/* whnc->whcn */
vsi_nn_PermuteTensor(self->graph, inputs[1], perm1, 4);
}
}
/* Rotate 180 degrees for weights data */
if (TRUE == inputs[1]->attr.is_const)
{
vsi_nn_reshuffle_weight_data(self->graph, inputs[1]);
if (!((vsi_nn_tensor_prv_t*)inputs[1])->processed) {
vsi_nn_reshuffle_weight_data(self->graph, inputs[1]);
}
}
#else
if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) >= 0 && TRUE == inputs[1]->attr.is_const)
{
/* whcn->whnc */
vsi_nn_PermuteTensor( self->graph, inputs[1], perm1, 4 );
if (!((vsi_nn_tensor_prv_t*)inputs[1])->processed) {
vsi_nn_PermuteTensor(self->graph, inputs[1], perm1, 4);
}
}
#endif

((vsi_nn_tensor_prv_t*)inputs[1])->processed = TRUE;

nn_param = &self->nn_param.deconv;

nn_param->group = ( 0 == nn_param->group ) ? 1 : nn_param->group;
@ -50,36 +50,12 @@ static vsi_status op_compute
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_param_t * param = NULL;
vsi_nn_kernel_node_t n = NULL;
uint32_t i = 0;
vsi_size_t block_size = 1, block_num = 1, axis_num = 0, indices_num = 1;
int32_t axis = self->nn_param.gather.axis;
int32_t batch_dims = self->nn_param.gather.batch_dims;
vsi_size_t *input_size = inputs[0]->attr.size;
uint32_t r_rank = vsi_nn_GetTensorIsScalar(inputs[0]) ? 0 : inputs[0]->attr.dim_num;
uint32_t q_rank = vsi_nn_GetTensorIsScalar(inputs[1]) ? 0 : inputs[1]->attr.dim_num;

param = vsi_nn_kernel_param_create();

for (i = 0; i < (uint32_t)axis; ++i)
{
block_size *= input_size[i];
}

axis_num = input_size[axis];
for (i = axis + 1; i < r_rank - batch_dims; ++i)
{
block_num *= input_size[i];
}
for (i = 0; i < q_rank - batch_dims; ++i)
{
indices_num *= inputs[1]->attr.size[i];
}

vsi_nn_kernel_param_add_int32( param, "block_size", (int32_t)block_size );
vsi_nn_kernel_param_add_int32( param, "block_num", (int32_t)block_num );
vsi_nn_kernel_param_add_int32( param, "axis_num", (int32_t)axis_num );
vsi_nn_kernel_param_add_int32( param, "axis", (int32_t)axis );
vsi_nn_kernel_param_add_int32( param, "indices_num", (int32_t)indices_num );
vsi_nn_kernel_param_add_int32( param, "batch_dims", (int32_t)batch_dims );

if (vsi_nn_is_same_data_type(inputs[0], outputs[0]) == FALSE ||
@ -234,6 +234,10 @@ static vsi_bool op_setup_default
{
attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
}
else if (inputs[GRUCELL_IN_H_STATE]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16)
{
attr.dtype.vx_type = VSI_NN_TYPE_BFLOAT16;
}
else
{
attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16;
@ -374,6 +374,17 @@ static vsi_bool op_setup
}
}

for ( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++)
{
if (inputs[LSTMUNIT_INPUT_WEIGHT_I2I + i] != NULL
&& p->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_I2I + i].qnt_type == VSI_NN_QNT_TYPE_NONE
&& p->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_I2I + i].vx_type == VSI_NN_TYPE_NONE
&& inputs[LSTMUNIT_INPUT_WEIGHT_I2I + i]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16)
{
p->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_I2I + i] = inputs[LSTMUNIT_INPUT_WEIGHT_I2I + i]->attr.dtype;
}
}

/* Input FC */
if( is_input_fc_on_tp )
{
@ -54,21 +54,12 @@ static vsi_status op_compute
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_param_t *param = NULL;
vsi_nn_kernel_node_t n = NULL;
vsi_nn_tensor_t * tmp_inputs[2] = {NULL};
vsi_nn_tensor_t * tmp_outputs[1] = {NULL};
uint32_t new_rank[3] = {0};
vsi_bool ret = FALSE;
vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}};

int32_t transposeA = self->nn_param.matrixmul.transpose[0];
int32_t transposeB = self->nn_param.matrixmul.transpose[1];
int32_t adjointA = self->nn_param.matrixmul.adjoint[0];
int32_t adjointB = self->nn_param.matrixmul.adjoint[1];

uint32_t cross_flg = 0;
uint32_t size_axis_inner_outer[3] = {0};
uint32_t stride_axis_inner_outer[9] = {0};

param = vsi_nn_kernel_param_create();

vsi_nn_kernel_param_add_int32( param, "transposeA", transposeA );
@ -76,52 +67,18 @@ static vsi_status op_compute
vsi_nn_kernel_param_add_int32( param, "adjointA", adjointA );
vsi_nn_kernel_param_add_int32( param, "adjointB", adjointB );


ret = vsi_nn_kernel_optimize_matrixmul_broadcast_shape(
inputs[0]->attr.size,
inputs[1]->attr.size,
outputs[0]->attr.size,
inputs[0]->attr.dim_num,
inputs[1]->attr.dim_num,
outputs[0]->attr.dim_num,
shapes[0], shapes[1], shapes[2], new_rank,
&cross_flg, size_axis_inner_outer, stride_axis_inner_outer);

if (ret)
{
vsi_nn_kernel_param_add_int32( param, "cross_flg", cross_flg );
vsi_nn_kernel_param_add_buffer( param, "size_axis_inner_outer", size_axis_inner_outer, 3);
vsi_nn_kernel_param_add_buffer( param, "stride_axis_inner_outer", stride_axis_inner_outer, 9);

tmp_inputs[0] = vsi_nn_reshape_tensor(self->graph, inputs[0], shapes[0], new_rank[0]);
tmp_inputs[1] = vsi_nn_reshape_tensor(self->graph, inputs[1], shapes[1], new_rank[1]);
tmp_outputs[0] = vsi_nn_reshape_tensor(self->graph, outputs[0], shapes[2], new_rank[2]);
}
else
{
VSILOGE("illegal inputs shape");
status = VSI_FAILURE;
goto final;
}


n = vsi_nn_kernel_selector( self->graph, "matrixmul", tmp_inputs, 2, tmp_outputs, 1, param );
n = vsi_nn_kernel_selector( self->graph, "matrixmul", inputs, 2, outputs, 1, param );
if ( n != NULL )
{
self->n = (vx_node)n;
status = VSI_SUCCESS;
}

final:
if (param != NULL)
{
vsi_nn_kernel_param_release( &param );
}

vsi_safe_release_tensor( tmp_inputs[0] );
vsi_safe_release_tensor( tmp_inputs[1] );
vsi_safe_release_tensor( tmp_outputs[0] );

return status;
} /* op_compute() */
@ -74,6 +74,20 @@ static vsi_bool op_check
return ret;
} /* op_check() */

static vsi_status op_init
(
vsi_nn_node_t * self
)
{
vsi_status status = VSI_SUCCESS;

self->nn_param.max_pool3d.dilation[0] = 1;
self->nn_param.max_pool3d.dilation[1] = 1;
self->nn_param.max_pool3d.dilation[2] = 1;

return status;
} /* op_init() */
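op_init seeds the new dilation fields with 1 so existing (non-dilated) graphs keep their behavior. Once dilation is in play, the effective kernel extent per axis grows to dilation * (ksize - 1) + 1, so for FLOOR rounding the per-axis output size computed in op_setup below should follow the usual

    out = floor((in + pad_front + pad_end - (dilation * (ksize - 1) + 1)) / stride) + 1

That is the standard dilated-pooling formula, stated here for orientation; the exact rounding is governed by p->round_type and the pad_type-driven padding computation.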

static vsi_status op_optimize
(
vsi_nn_node_t * self,
@ -120,7 +134,7 @@ static vsi_bool op_setup
inputs[0]->attr.size,
ksize,
p->stride,
NULL,
p->dilation,
p->pad_type,
pad
);
@ -142,7 +156,7 @@ static vsi_bool op_setup
p->ksize[0],
&p->pad[0],
p->stride[0],
0,
p->dilation[0],
p->round_type
);

@ -152,7 +166,7 @@ static vsi_bool op_setup
p->ksize[1],
&p->pad[2],
p->stride[1],
0,
p->dilation[1],
p->round_type
);

@ -162,7 +176,7 @@ static vsi_bool op_setup
p->ksize[2],
&p->pad[4],
p->stride[2],
0,
p->dilation[2],
p->round_type
);

@ -210,6 +224,8 @@ static vsi_bool op_setup
curr->node->nn_param.pool.pad[1] = p->pad[1];
curr->node->nn_param.pool.pad[2] = p->pad[2];
curr->node->nn_param.pool.pad[3] = p->pad[3];
curr->node->nn_param.pool.dilation[0] = p->dilation[0];
curr->node->nn_param.pool.dilation[1] = p->dilation[1];
curr->node->nn_param.pool.type = VX_CONVOLUTIONAL_NETWORK_POOLING_MAX;
curr->node->nn_param.pool.round_type = p->round_type;
curr->node->nn_param.pool.pad_type = p->pad_type;
@ -265,6 +281,8 @@ static vsi_bool op_setup
curr->node->nn_param.pool.pad[1] = 0;
curr->node->nn_param.pool.pad[2] = p->pad[4];
curr->node->nn_param.pool.pad[3] = p->pad[5];
curr->node->nn_param.pool.dilation[0] = 1;
curr->node->nn_param.pool.dilation[1] = p->dilation[2];
curr->node->nn_param.pool.type = VX_CONVOLUTIONAL_NETWORK_POOLING_MAX;
curr->node->nn_param.pool.round_type = p->round_type;
curr->node->nn_param.pool.pad_type = p->pad_type;
@ -305,7 +323,7 @@ __BEGIN_DECLS
DEF_OP_REG
(
/* op_name */ MAX_POOL3D,
/* init */ NULL,
/* init */ op_init,
/* compute */ op_compute,
/* deinit */ op_deinit,
/* check */ op_check,

Some files were not shown because too many files have changed in this diff