Update internal ovxlib to rel1.2.6 (#690)

Internal ovxlib commit hash: c5d3e69356579fc7b595a7c0939fc7e4e0aaab5a

Type: Code Improvement

Signed-off-by: Feiyue Chen <Feiyue.Chen@verisilicon.com>
Chen Feiyue 2024-04-02 19:56:46 +08:00 committed by GitHub
parent 3ea908ca6d
commit 3b80968fb1
69 changed files with 4878 additions and 590 deletions

View File

@ -1 +1 @@
1.1.50
1.2.6

src/tim/vx/internal/include/interface/ops.def Executable file → Normal file
View File

@ -197,3 +197,5 @@ DEF_OP(RESIZE_3D)
DEF_OP(REDUCEL2)
DEF_OP(CROP_AND_RESIZE)
DEF_OP(TAN)
DEF_OP(RMSNORM)
DEF_OP(SHAPE)

View File

@ -26,6 +26,8 @@
#define _VSI_NN_KERNEL_H
#include <stdint.h>
#include <stdarg.h>
#include "vsi_nn_log.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_graph.h"
@ -81,6 +83,7 @@ typedef enum
U4,
FP8_E4M3,
FP8_E5M2,
INVALID_DTYPE,
} VSI_PUBLIC_TYPE vsi_nn_kernel_dtype_e;
typedef enum
@ -532,9 +535,8 @@ static VSI_INLINE_API vsi_nn_kernel_dtype_e vsi_nn_kernel_map_dtype
return FP8_E5M2;
default:
VSILOGE("error data type %d", dtype);
break;
return INVALID_DTYPE;
}
return I8;
} /* vsi_nn_kernel_map_dtype() */
static VSI_INLINE_API vsi_nn_type_e vsi_nn_dtype_map_kernel

View File

@ -43,6 +43,7 @@ typedef struct _vsi_nn_resize_internal_param
vsi_bool half_pixel_centers;
float factor;
vsi_enum layout;
vsi_enum type;
} vsi_nn_resize_internal_param;
#ifdef __cplusplus

View File

@ -0,0 +1,54 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_RMSNORM_H
#define _VSI_NN_OP_RMSNORM_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _rmsnorm_local_data_t {
int32_t placeholder;
} rmsnorm_local_data_t;
typedef struct _vsi_nn_rmsnorm_param
{
struct _rmsnorm_local_data_t* local;
float eps;
int32_t axis;
} vsi_nn_rmsnorm_param;
_compiler_assert(offsetof(vsi_nn_rmsnorm_param, local) == 0, \
vsi_nn_rmsnorm_h );
#ifdef __cplusplus
}
#endif
#endif
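
For context, the new RMSNORM operation carries eps and axis parameters. As a point of reference only (this is not the ovxlib kernel), the usual RMS normalization it corresponds to scales each element by the reciprocal root-mean-square over the normalized axis:

#include <math.h>
#include <stddef.h>

/* Reference-only sketch: y[i] = x[i] / sqrt(mean(x^2) + eps) * gamma[i]. */
void rms_norm_ref(const float *x, const float *gamma, float *y,
                  size_t n, float eps)
{
    float sum_sq = 0.0f;
    for (size_t i = 0; i < n; ++i) sum_sq += x[i] * x[i];
    float inv_rms = 1.0f / sqrtf(sum_sq / (float)n + eps);
    for (size_t i = 0; i < n; ++i) y[i] = x[i] * inv_rms * gamma[i];
}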

View File

@ -0,0 +1,47 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_SHAPE_H
#define _VSI_NN_OP_SHAPE_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_shape_param
{
struct _shape_local_data_t* local;
// Add parameters here
} vsi_nn_shape_param;
_compiler_assert(offsetof(vsi_nn_shape_param, local) == 0, \
vsi_nn_shape_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@ -33,6 +33,14 @@
extern "C" {
#endif
/*
* A helper union for fp32 bit casting.
*/
typedef union {
float val;
uint32_t data;
} fp32_bit_cast_t;
static VSI_INLINE_API vsi_bool type_is_integer
(
const vsi_nn_type_e type
@ -203,9 +211,11 @@ static VSI_INLINE_API vsi_bool fp32_is_inf
float val
)
{
uint32_t u_value = *(uint32_t*)&val;
fp32_bit_cast_t fp32_bit_cast;
fp32_bit_cast.val = val;
uint32_t fp32_data = fp32_bit_cast.data;
if ((u_value & (uint32_t)VSI_NN_INT32_MAX) == (uint32_t)VSI_NN_FLOAT32_INF)
if ((fp32_data & (uint32_t)VSI_NN_INT32_MAX) == (uint32_t)VSI_NN_FLOAT32_INF)
{
return TRUE;
}
@ -232,7 +242,9 @@ static VSI_INLINE_API int32_t fp32_to_affine
if (fp32_is_inf(in) != 0)
{
uint32_t sign = (*(uint32_t*)&in) >> 31;
fp32_bit_cast_t fp32_bit_cast;
fp32_bit_cast.val = in;
uint32_t sign = fp32_bit_cast.data >> 31;
data = sign == 1 ? (int32_t)min_range : (int32_t)max_range;
}
@ -277,7 +289,9 @@ static VSI_INLINE_API int32_t fp32_to_dfp
if (fp32_is_inf(in) != 0)
{
uint32_t sign = (*(uint32_t*)&in) >> 31;
fp32_bit_cast_t fp32_bit_cast;
fp32_bit_cast.val = in;
uint32_t sign = fp32_bit_cast.data >> 31;
data = sign == 1 ? (int32_t)min_range : (int32_t) max_range;
}
@ -373,8 +387,9 @@ static VSI_INLINE_API float bfp16_to_fp32
int16_t in
)
{
int32_t t1, t2, t3;
uint32_t t1, t2, t3;
float out;
fp32_bit_cast_t fp32_bit_cast;
t1 = in & 0x00FF; // Mantissa
t2 = in & 0xFF00; // Sign bit + Exponent
@ -384,9 +399,10 @@ static VSI_INLINE_API float bfp16_to_fp32
t2 <<= 16; // Shift (sign + Exponent) bit into position
t1 |= t2; // Re-insert (sign + Exponent) bit
*((uint32_t*)&out) = t1;
fp32_bit_cast.data = t1;
out = fp32_bit_cast.val;
return t3 == 0 ? 0 : out;
return t3 == 0 ? 0.0f : out;
} /* bfp16_to_fp32() */
static VSI_INLINE_API uint16_t fp32_to_fp16
@ -394,10 +410,12 @@ static VSI_INLINE_API uint16_t fp32_to_fp16
float in
)
{
uint32_t fp32 = *((uint32_t *) &in);
uint32_t t1 = (fp32 & 0x80000000u) >> 16; /* sign bit. */
uint32_t t2 = (fp32 & 0x7F800000u) >> 13; /* Exponent bits */
uint32_t t3 = (fp32 & 0x007FE000u) >> 13; /* Mantissa bits, no rounding */
fp32_bit_cast_t fp32_bit_cast;
fp32_bit_cast.val = in;
uint32_t fp32_data = fp32_bit_cast.data;
uint32_t t1 = (fp32_data & 0x80000000u) >> 16; /* sign bit. */
uint32_t t2 = (fp32_data & 0x7F800000u) >> 13; /* Exponent bits */
uint32_t t3 = (fp32_data & 0x007FE000u) >> 13; /* Mantissa bits, no rounding */
uint32_t fp16 = 0u;
if( t2 >= 0x023c00u )
{
@ -420,8 +438,10 @@ static VSI_INLINE_API uint16_t fp32_to_bfp16
float in
)
{
uint32_t fp32 = *((unsigned int *) &in);
uint32_t t1 = fp32 >> 16;
fp32_bit_cast_t fp32_bit_cast;
fp32_bit_cast.val = in;
uint32_t fp32_data = fp32_bit_cast.data;
uint32_t t1 = fp32_data >> 16;
return (uint16_t) t1;
} /* fp32_to_bfp16() */
@ -435,10 +455,12 @@ static VSI_INLINE_API uint16_t fp32_to_bfp16_rtne
Convert a float point to bfloat16, with round-nearest-to-even as rounding method.
*/
uint32_t fp32 = *((unsigned int *) &in);
fp32_bit_cast_t fp32_bit_cast;
fp32_bit_cast.val = in;
uint32_t fp32_data = fp32_bit_cast.data;
uint16_t out;
uint32_t lsb = (fp32 >> 16) & 1; /* Least significant bit of resulting bfloat. */
uint32_t lsb = (fp32_data >> 16) & 1; /* Least significant bit of resulting bfloat. */
uint32_t rounding_bias = 0x7fff + lsb;
if ( VSI_NN_FLOAT32_NAN == in )
@ -447,8 +469,8 @@ static VSI_INLINE_API uint16_t fp32_to_bfp16_rtne
}
else
{
fp32 += rounding_bias;
out = (uint16_t) (fp32 >> 16);
fp32_data += rounding_bias;
out = (uint16_t) (fp32_data >> 16);
}
return out;
@ -466,7 +488,9 @@ static VSI_INLINE_API uint16_t fp32_to_bfp16_rtne
static VSI_INLINE_API uint8_t fp32_to_fp8_e4m3(float in, const float scale) {
float fp8_f32 = in / scale;
int32_t in_val = *((int32_t*)&fp8_f32);
fp32_bit_cast_t fp32_bit_cast;
fp32_bit_cast.val = fp8_f32;
uint32_t in_val = fp32_bit_cast.data;
uint32_t in_sign = (in_val >> (FLOAT_EXPONENT_SIZE + FLOAT_MANTISSA_SIZE)) & 0x1; /* bit 31 is sign */
uint32_t in_exp = (in_val >> FLOAT_MANTISSA_SIZE) & 0xFF; /* bit[30: 24] is exp */
@ -512,7 +536,9 @@ static VSI_INLINE_API uint8_t fp32_to_fp8_e4m3(float in, const float scale) {
static VSI_INLINE_API uint8_t fp32_to_fp8_e5m2(float in, const float scale) {
float fp8_f32 = in / scale;
int32_t in_val = *((int32_t*)&fp8_f32);
fp32_bit_cast_t fp32_bit_cast;
fp32_bit_cast.val = fp8_f32;
uint32_t in_val = fp32_bit_cast.data;
uint32_t in_sign = (in_val >> (FLOAT_EXPONENT_SIZE + FLOAT_MANTISSA_SIZE)) & 0x1; /* bit 31 is sign */
uint32_t in_exp = (in_val >> FLOAT_MANTISSA_SIZE) & 0xFF; /* bit[30: 24] is exp */
uint32_t in_man = (in_val & 0x7FFFFF); /* low 23 bits is man */
@ -561,6 +587,7 @@ static VSI_INLINE_API float fp8_e4m3_to_fp32(uint8_t in, const float scale) {
uint32_t exponentOut = 0;
uint32_t mantissaOut = 0;
uint32_t out_u = 0;
fp32_bit_cast_t fp32_bit_cast;
{
uint32_t signIn;
@ -610,7 +637,8 @@ static VSI_INLINE_API float fp8_e4m3_to_fp32(uint8_t in, const float scale) {
}
final:
out_u = signOut << 31 | exponentOut << 23 | mantissaOut;
val_fp32 = *((float*)&out_u);
fp32_bit_cast.data = out_u;
val_fp32 = fp32_bit_cast.val;
return val_fp32 * scale;
} /* fp8_e4m3_to_fp32() */
@ -621,6 +649,7 @@ static VSI_INLINE_API float fp8_e5m2_to_fp32(int8_t in, const float scale) {
uint32_t exponentOut = 0;
uint32_t mantissaOut = 0;
uint32_t out_u = 0;
fp32_bit_cast_t fp32_bit_cast;
{
uint32_t signIn;
@ -670,7 +699,8 @@ static VSI_INLINE_API float fp8_e5m2_to_fp32(int8_t in, const float scale) {
}
final:
out_u = signOut << 31 | exponentOut << 23 | mantissaOut;
val_fp32 = *((float*)&out_u);
fp32_bit_cast.data = out_u;
val_fp32 = fp32_bit_cast.val;
return val_fp32 * scale;
} /* fp8_e5m2_to_fp32() */
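
The hunks above replace pointer type punning such as *(uint32_t*)&val with the new fp32_bit_cast_t union. Reading the other union member yields the same bit pattern without the strict-aliasing undefined behavior of casting a float* to uint32_t*. A standalone sketch of the pattern (the helper names are illustrative, not part of ovxlib):

#include <stdint.h>
#include <stdio.h>

typedef union {
    float    val;
    uint32_t data;
} fp32_bits; /* same idea as fp32_bit_cast_t */

uint32_t float_to_bits(float f)    { fp32_bits u; u.val = f;  return u.data; }
float    bits_to_float(uint32_t b) { fp32_bits u; u.data = b; return u.val; }

int main(void)
{
    printf("bits(1.0f)        = 0x%08X\n", (unsigned)float_to_bits(1.0f)); /* 0x3F800000 */
    printf("float(0x7F800000) = %f\n", bits_to_float(0x7F800000u));        /* inf */
    return 0;
}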

View File

@ -60,9 +60,7 @@ typedef struct _vsi_nn_hw_config_t
{
char target_name[VSI_NN_MAX_TARGET_NAME];
vsi_nn_hw_evis_t evis;
#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
uint32_t subGroupSize;
#endif
uint32_t use_40bits_va;
uint32_t support_stream_processor;
uint32_t sp_exec_count;

View File

@ -210,6 +210,8 @@
#include "ops/vsi_nn_op_resize_3d.h"
#include "ops/vsi_nn_op_reducel2.h"
#include "ops/vsi_nn_op_crop_and_resize.h"
#include "ops/vsi_nn_op_rmsnorm.h"
#include "ops/vsi_nn_op_shape.h"
/* custom node head define define */
#include "custom/vsi_nn_custom_node_type.h"
#include "ops/vsi_nn_op_inverse_sigmoid.h"
@ -408,6 +410,8 @@ typedef union _vsi_nn_nn_param
vsi_nn_resize_3d_param resize_3d;
vsi_nn_reducel2_param reducel2;
vsi_nn_crop_and_resize_param crop_and_resize;
vsi_nn_rmsnorm_param rmsnorm;
vsi_nn_shape_param shape;
void* client_param;
/* custom node data struct define */

View File

@ -33,7 +33,7 @@ extern "C"{
#define VSI_NN_VERSION_MAJOR 1
#define VSI_NN_VERSION_MINOR 2
#define VSI_NN_VERSION_PATCH 2
#define VSI_NN_VERSION_PATCH 6
#define VSI_NN_VERSION \
(VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
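
With the patch level bumped from 2 to 6, VSI_NN_VERSION now evaluates to 1 * 10000 + 2 * 100 + 6 = 10206.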

View File

@ -267,7 +267,7 @@ static vsi_nn_kernel_node_t _setup
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_TINY_YOLOV4_POSTPROCESS_CONFIDENCE_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = { 0 };
vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = { { 0 } };
vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
VSI_UNREFERENCED(params);

View File

@ -55,8 +55,17 @@ static vsi_status op_compute
vsi_nn_kernel_param_t * param = NULL;
vsi_nn_custom_warp_affine_param * p;
p = &(self->nn_param.custom_warp_affine);
float matrix_shape[6] = { 1.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f};
param = vsi_nn_kernel_param_create();
//Unlike OpenCV, we use the coordinate of dst and matrix to calculate the coordinate of src in custom_warp_affine.
//Therefore, matrix M_ovx in custom_warp_affine is different from matrix M_cv in OpenCV.
//We get M_ovx by transposing the inverse of M_cv.
//inv_M = cv2.invertAffineTransform(M_cv); M_ovx=inv_M.transpose(1,0)
if (p->matrix == NULL)
{
p->matrix = matrix_shape;
}
vsi_nn_kernel_param_add_const_buffer( param, "matrix", p->matrix, 6 );
vsi_nn_kernel_param_add_int32( param, "type", p->type);
vsi_nn_kernel_param_add_int32( param, "rgb_type", p->rgb_type);
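
The added comment describes how the matrix passed to custom_warp_affine relates to an OpenCV affine matrix: invert the 2x3 transform (so it maps dst coordinates back to src), then transpose it; the new default matrix_shape is simply the identity transform. A host-side sketch of that derivation (the function name and the exact element layout are illustrative assumptions, not ovxlib API):

/* Illustration only: derive a 6-element dst->src matrix from an
 * OpenCV-style 2x3 src->dst affine matrix, following the comment's
 * "invert, then transpose" recipe. */
int cv_affine_to_ovx(const float m_cv[6], float m_ovx[6])
{
    /* M_cv = | a b tx |
     *        | c d ty |  */
    float a = m_cv[0], b = m_cv[1], tx = m_cv[2];
    float c = m_cv[3], d = m_cv[4], ty = m_cv[5];
    float det = a * d - b * c;
    if (det == 0.0f) return -1;               /* not invertible */

    float ia =  d / det, ib = -b / det;
    float ic = -c / det, id =  a / det;
    float itx = -(ia * tx + ib * ty);
    float ity = -(ic * tx + id * ty);

    /* Transposed (3x2) layout of the inverse, flattened row-major. */
    m_ovx[0] = ia;  m_ovx[1] = ic;
    m_ovx[2] = ib;  m_ovx[3] = id;
    m_ovx[4] = itx; m_ovx[5] = ity;
    return 0;
}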

View File

@ -47,18 +47,26 @@ typedef enum
} _internal_kernel_e;
#define _BILINEAR_GRID_SAMPLE_KERNEL_SOURCE() "bilinear_grid_sample"
#define _BILINEAR_GRID_SAMPLE_REFLECT_KERNEL_SOURCE() "bilinear_grid_sample_reflect"
#define STR(a) #a
// Add kernel hashtable here
#define BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
((IN1_DTYPE << 20) | (IN0_DTYPE << 8) | (OUT_DTYPE))
#define BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, REFLECT) \
((IN1_DTYPE << 24) | (IN0_DTYPE << 16) | (OUT_DTYPE << 8) | (REFLECT))
#define PACK_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
{ \
BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE), \
CVIVANTE_NAMESPACE("cl.bilinear_grid_sample_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \
_BILINEAR_GRID_SAMPLE_KERNEL_SOURCE() \
BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0), \
CVIVANTE_NAMESPACE("cl.bilinear_grid_sample_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \
_BILINEAR_GRID_SAMPLE_KERNEL_SOURCE() \
}
#define PACK_REFLECT_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
{ \
BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1), \
CVIVANTE_NAMESPACE("cl.bilinear_grid_sample_reflect_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \
_BILINEAR_GRID_SAMPLE_REFLECT_KERNEL_SOURCE() \
}
typedef struct
@ -73,6 +81,8 @@ static const _kernel_map_type _bilinear_grid_sample_kernel_map[] =
// Register kernel here
PACK_KERNEL_MAP(F32, F32, F32 ),
PACK_KERNEL_MAP(U8, U8, U8),
PACK_REFLECT_KERNEL_MAP(F32, F32, F32),
PACK_REFLECT_KERNEL_MAP(U8, U8, U8),
};
@ -95,23 +105,24 @@ static vx_param_description_t _bilinear_grid_sample_kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _BILINEAR_GRID_SAMPLE_PARAM_NUM 8
#define _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM \
#define _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM 14
#define _BILINEAR_GRID_SAMPLE_REFLECT_PARAM_NUM 12
#define _BILINEAR_GRID_SAMPLE_REFLECT_PARAM_QUANT_NUM \
_cnt_of_array(_bilinear_grid_sample_kernel_param_def)
#define SCALAR_HALF_INPUT0_W (3)
#define SCALAR_HALF_INPUT0_H (4)
#define SCALAR_ADD_VALUE_W (5)
#define SCALAR_ADD_VALUE_H (6)
#define SCALAR_DEPTH (7)
#define SCALAR_INPUT0_SCALE (8)
#define SCALAR_INPUT0_TAIL (9)
#define SCALAR_INPUT1_SCALE (10)
#define SCALAR_INPUT1_TAIL (11)
#define SCALAR_OUTPUT_SCALE (12)
#define SCALAR_OUTPUT_TAIL (13)
/*
* Kernel initializer
@ -170,7 +181,8 @@ static vsi_status _query_kernel
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
vsi_bool* is_use_u8_kernel
vsi_bool* is_use_u8_kernel,
int32_t is_reflect_mode
)
{
vsi_status status = VSI_FAILURE;
@ -199,14 +211,29 @@ static vsi_status _query_kernel
out_dtype = F32;
}
if ((U8 == in0_dtype) || (U8 == out_dtype)) {
param_def_size = _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM;
if (is_reflect_mode)
{
param_def_size = _BILINEAR_GRID_SAMPLE_REFLECT_PARAM_QUANT_NUM;
}
else{
param_def_size = _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM;
}
*is_use_u8_kernel = TRUE;
} else {
param_def_size = _BILINEAR_GRID_SAMPLE_PARAM_NUM;
if (is_reflect_mode)
{
param_def_size = _BILINEAR_GRID_SAMPLE_REFLECT_PARAM_NUM;
}
else
{
param_def_size = _BILINEAR_GRID_SAMPLE_PARAM_NUM;
}
*is_use_u8_kernel = FALSE;
}
key = BILINEAR_GRID_SAMPLE_HASH_KEY(in0_dtype, in1_dtype, out_dtype);
key = BILINEAR_GRID_SAMPLE_HASH_KEY(in0_dtype, in1_dtype, out_dtype, is_reflect_mode);
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
@ -245,7 +272,7 @@ static vsi_nn_kernel_node_t _setup
{
vsi_nn_kernel_node_t node = NULL;
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM];
vsi_nn_kernel_node_param_t node_params[_BILINEAR_GRID_SAMPLE_REFLECT_PARAM_QUANT_NUM];
vsi_size_t final_shape[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1};
uint32_t final_in1_rank = 0;
vsi_nn_tensor_t* rs_tensors = NULL;
@ -263,11 +290,14 @@ static vsi_nn_kernel_node_t _setup
vsi_bool is_use_u8_kernel = FALSE;
int32_t align_corners =
vsi_nn_kernel_param_get_int32(params, "align_corners");
int32_t pad_mode = vsi_nn_kernel_param_get_int32(params, "padding_mode");
uint32_t pad_val = 0;
int32_t depth = 0;
vsi_nn_kernel_dtype_e in0_dtype;
float half_input0_w, half_input0_h, add_float_value_w, add_float_value_h;
int32_t is_reflect_mode = 0;
float min_val_w, span_w, min_val_h, span_h;
// Check if gpu can support the size
if (!vsi_nn_kernel_gpu_check_shape(inputs[0]->attr.size,
@ -280,6 +310,11 @@ static vsi_nn_kernel_node_t _setup
return NULL;
}
if (pad_mode == VSI_NN_PAD_MODE_REFLECT)
{
is_reflect_mode = 1;
}
final_tensors[0] = inputs[0];
if (inputs[1]->attr.dim_num >= 3) {
@ -313,12 +348,35 @@ static vsi_nn_kernel_node_t _setup
add_float_value_h = half_input0_h - 0.5f;
}
if (is_reflect_mode)
{
float low_w, low_h, high_w, high_h;
if (align_corners)
{
low_w = 0;
low_h = 0;
high_w = 2 * (float)(in0_width - 1);
high_h = 2 * (float)(in0_height - 1);
}
else
{
low_w = -1;
low_h = -1;
high_w = 2 * (float)in0_width - 1;
high_h = 2 * (float)in0_height - 1;
}
min_val_w = low_w / 2;
span_w = (high_w - low_w) / 2;
min_val_h = low_h / 2;
span_h = (high_h - low_h) / 2;
}
depth = (int32_t)inputs[0]->attr.size[2];
in0_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type);
if (U8 == in0_dtype) {
pad_val = inputs[0]->attr.dtype.zero_point;
}
status = _query_kernel(kernel, inputs, outputs, &is_use_u8_kernel);
status = _query_kernel(kernel, inputs, outputs, &is_use_u8_kernel, is_reflect_mode);
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
@ -326,7 +384,7 @@ static vsi_nn_kernel_node_t _setup
{
size_t node_params_num = _BILINEAR_GRID_SAMPLE_PARAM_NUM;
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM,
vsi_nn_kernel_node_pack_io( node_params, _BILINEAR_GRID_SAMPLE_REFLECT_PARAM_QUANT_NUM,
final_tensors, input_num, &final_tensors[2], output_num );
node_params[SCALAR_HALF_INPUT0_W] = vsi_nn_kernel_scalar_create( graph, F32, &half_input0_w );
node_params[SCALAR_HALF_INPUT0_H] = vsi_nn_kernel_scalar_create( graph, F32, &half_input0_h );
@ -335,13 +393,19 @@ static vsi_nn_kernel_node_t _setup
node_params[SCALAR_DEPTH] = vsi_nn_kernel_scalar_create( graph, I32, &depth );
if (is_use_u8_kernel)
{
node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input0_scale );
node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create( graph, F32, &input0_tail );
node_params[SCALAR_INPUT1_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input1_scale );
node_params[SCALAR_INPUT1_TAIL] = vsi_nn_kernel_scalar_create( graph, F32, &input1_tail );
node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale );
node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp );
node_params_num = _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM;
node_params[node_params_num++] = vsi_nn_kernel_scalar_create( graph, F32, &input0_scale );
node_params[node_params_num++] = vsi_nn_kernel_scalar_create( graph, F32, &input0_tail );
node_params[node_params_num++] = vsi_nn_kernel_scalar_create( graph, F32, &input1_scale );
node_params[node_params_num++] = vsi_nn_kernel_scalar_create( graph, F32, &input1_tail );
node_params[node_params_num++] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale );
node_params[node_params_num++] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp );
}
if (is_reflect_mode)
{
node_params[node_params_num++] = vsi_nn_kernel_scalar_create(graph, F32, &min_val_w);
node_params[node_params_num++] = vsi_nn_kernel_scalar_create(graph, F32, &span_w);
node_params[node_params_num++] = vsi_nn_kernel_scalar_create(graph, F32, &min_val_h);
node_params[node_params_num++] = vsi_nn_kernel_scalar_create(graph, F32, &span_h);
}
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
@ -351,19 +415,34 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release(&node_params[SCALAR_ADD_VALUE_W]);
vsi_nn_kernel_scalar_release(&node_params[SCALAR_ADD_VALUE_H]);
vsi_nn_kernel_scalar_release(&node_params[SCALAR_DEPTH]);
node_params_num = _BILINEAR_GRID_SAMPLE_PARAM_NUM;
if (is_use_u8_kernel) {
vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT0_SCALE]);
vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT0_TAIL]);
vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT1_SCALE]);
vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT1_TAIL]);
vsi_nn_kernel_scalar_release(&node_params[SCALAR_OUTPUT_SCALE]);
vsi_nn_kernel_scalar_release(&node_params[SCALAR_OUTPUT_TAIL]);
vsi_nn_kernel_scalar_release(&node_params[node_params_num++]);
vsi_nn_kernel_scalar_release(&node_params[node_params_num++]);
vsi_nn_kernel_scalar_release(&node_params[node_params_num++]);
vsi_nn_kernel_scalar_release(&node_params[node_params_num++]);
vsi_nn_kernel_scalar_release(&node_params[node_params_num++]);
vsi_nn_kernel_scalar_release(&node_params[node_params_num++]);
}
if (is_reflect_mode)
{
vsi_nn_kernel_scalar_release(&node_params[node_params_num++]);
vsi_nn_kernel_scalar_release(&node_params[node_params_num++]);
vsi_nn_kernel_scalar_release(&node_params[node_params_num++]);
vsi_nn_kernel_scalar_release(&node_params[node_params_num++]);
}
{
// Set default border mode.
vx_border_t border;
border.mode = VX_BORDER_CONSTANT;
border.constant_value.U32 = pad_val;
if (pad_mode == VSI_NN_PAD_MODE_CONSTANT)
{
border.mode = VX_BORDER_CONSTANT;
border.constant_value.U32 = pad_val;
}
else
{
border.mode = VX_BORDER_REPLICATE;
}
status = vxSetNodeAttribute(
(vx_node)node, VX_NODE_BORDER, &border, sizeof(border));
CHECK_STATUS(status);
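
The reflect-mode scalars computed above (min_val_w/h and span_w/h) drive the fold the kernels apply to each sampling coordinate: the distance from min_val is measured, whole spans are counted, and odd counts flip the direction. A standalone sketch of the same fold (it mirrors the arithmetic in the new CL kernels, but the helper itself is only an illustration):

#include <math.h>

float reflect_coord(float x, float min_val, float span)
{
    if (span <= 0.0f) return 0.0f;      /* degenerate range collapses to a point */
    float d     = fabsf(x - min_val);
    int   flips = (int)(d / span);
    float extra = d - flips * span;
    return (flips & 1) ? min_val + (span - extra) : min_val + extra;
}

With align_corners the reflected range is [0, W-1] (min_val = 0, span = W-1); without it, it is the half-pixel range [-0.5, W-0.5] (min_val = -0.5, span = W), matching the low/high values set above.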

View File

@ -244,7 +244,10 @@ DEF_KERNEL_INITIALIZER(_gather_elements_initializer)
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
SAFE_FREE_TENSOR_ATTR(input_attr0);
SAFE_FREE_TENSOR_ATTR(input_attr1);
SAFE_FREE_TENSOR_ATTR(output_attr);
return status;
} /* _gather_elements_initializer() */

View File

@ -34,6 +34,7 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util_prv.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
@ -1489,8 +1490,8 @@ static vsi_nn_kernel_node_t _setup
float twoLogE = 2 * logE;
uint32_t uint_min = 0xFBFFFFFF;
uint32_t uint_max = 0x7BFFFFFF;
float float_min = *(vx_float32 *)&uint_min;
float float_max = *(vx_float32 *)&uint_max;
float float_min = 0.0f;
float float_max = 0.0f;
float scale_val[9] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
float tail_val[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
vsi_bool is_u8_type = FALSE;
@ -1499,6 +1500,12 @@ static vsi_nn_kernel_node_t _setup
size_t lstm_activation_in_out_num = 0;
uint32_t i;
fp32_bit_cast_t fp32_bit_cast;
fp32_bit_cast.data = uint_min;
float_min = fp32_bit_cast.val;
fp32_bit_cast.data = uint_max;
float_max = fp32_bit_cast.val;
_is_ln = vsi_nn_kernel_param_get_int32( params, "_is_ln" );
_is_cifg = vsi_nn_kernel_param_get_int32( params, "_is_cifg" );
_is_proj = vsi_nn_kernel_param_get_int32( params, "_is_proj" );

View File

@ -266,6 +266,7 @@ static vsi_nn_kernel_node_t _setup
vsi_bool is_use_u8_kernel = FALSE;
int32_t align_corners =
vsi_nn_kernel_param_get_int32(params, "align_corners");
int32_t pad_mode = vsi_nn_kernel_param_get_int32(params, "padding_mode");
uint32_t pad_val = 0;
int32_t depth = 0;
vsi_nn_kernel_dtype_e in0_dtype;
@ -282,6 +283,11 @@ static vsi_nn_kernel_node_t _setup
return NULL;
}
if (pad_mode == VSI_NN_PAD_MODE_REFLECT)
{
return NULL;
}
final_tensors[0] = inputs[0];
if (inputs[1]->attr.dim_num >= 3) {
final_shape[0] = inputs[1]->attr.size[1] * inputs[1]->attr.size[0];
@ -382,8 +388,15 @@ static vsi_nn_kernel_node_t _setup
{
// Set default border mode.
vx_border_t border;
border.mode = VX_BORDER_CONSTANT;
border.constant_value.U32 = pad_val;
if (pad_mode == VSI_NN_PAD_MODE_CONSTANT)
{
border.mode = VX_BORDER_CONSTANT;
border.constant_value.U32 = pad_val;
}
else
{
border.mode = VX_BORDER_REPLICATE;
}
status = vxSetNodeAttribute(
(vx_node)node, VX_NODE_BORDER, &border, sizeof(border));
CHECK_STATUS(status);

View File

@ -36,6 +36,7 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_dtype_util_prv.h"
__BEGIN_DECLS
@ -242,6 +243,7 @@ static vsi_nn_kernel_node_t _setup
vsi_size_t suffix_dim_size = 0;
int32_t depth = vsi_nn_kernel_param_get_int32( params, "depth" );
vsi_nn_kernel_dtype_e out_dtype;
fp32_bit_cast_t fp32_bit_cast;
uint32_t data[2] = {0};
float on_value = vsi_nn_kernel_param_get_float32( params, "on_value" );
float off_value = vsi_nn_kernel_param_get_float32( params, "off_value" );
@ -258,8 +260,11 @@ static vsi_nn_kernel_node_t _setup
}
else
{
data[0] = *(uint32_t*)&on_value;
data[1] = *(uint32_t*)&off_value;
fp32_bit_cast.val = on_value;
data[0] = fp32_bit_cast.data;
fp32_bit_cast.val = off_value;
data[1] = fp32_bit_cast.data;
}
axis = axis == -1 ? (int32_t)inputs[0]->attr.dim_num : (int32_t)inputs[0]->attr.dim_num - axis;

View File

@ -34,20 +34,24 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
#define _TOPK_KERNEL_SOURCE "topk"
#define STR(a) #a
// Add kernel hashtable here
#define TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ) \
( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (STAGES << 16) )
#define TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES, SECTION ) \
( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (STAGES << 16) | (SECTION << 26))
#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, STAGES ) \
{ TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ), \
{ TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES, 0 ), \
CVIVANTE_NAMESPACE("cl.topk_stage"STR(STAGES)"_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \
_TOPK_KERNEL_SOURCE }
#define PACK_MERGE_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
{ TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, 1 ), \
CVIVANTE_NAMESPACE("cl.topk_stage_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \
"topk2" }
#define TOPK_ODD_EVEN_SORT_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) )
#define PACK_ODD_EVEN_SORT_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
@ -111,6 +115,9 @@ static const _kernel_map_type _topk_kernel_map[] =
PACK_KERNEL_MAP( F32, I32, 4 ),
PACK_KERNEL_MAP( F32, I32, 5 ),
PACK_KERNEL_MAP( F32, I32, 6 ),
PACK_MERGE_KERNEL_MAP(U32, U32),
PACK_MERGE_KERNEL_MAP(I32, I32),
};
static const _kernel_map_type _topk_odd_even_sort_kernel_map[] =
@ -254,7 +261,8 @@ static vsi_status _query_kernel
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t num_stages
int32_t num_stages,
vsi_bool is_bitnoic_segment
)
{
vsi_status status = VSI_FAILURE;
@ -272,21 +280,23 @@ static vsi_status _query_kernel
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
num_stages = is_bitnoic_segment ? 0 : num_stages;
switch (_PACK_SELECT_KEY(in_dtype, out_dtype))
{
case _PACK_SELECT_KEY(F32, F32):
case _PACK_SELECT_KEY(F16, F16):
key = TOPK_HASH_KEY( F32, F32, num_stages );
key = TOPK_HASH_KEY( F32, F32, num_stages, is_bitnoic_segment );
break;
case _PACK_SELECT_KEY(U32, U32):
case _PACK_SELECT_KEY(U16, U16):
case _PACK_SELECT_KEY(U8, U8):
key = TOPK_HASH_KEY( U32, U32, num_stages );
key = TOPK_HASH_KEY( U32, U32, num_stages, is_bitnoic_segment );
break;
case _PACK_SELECT_KEY(I32, I32):
case _PACK_SELECT_KEY(I16, I16):
case _PACK_SELECT_KEY(I8, I8):
key = TOPK_HASH_KEY( I32, I32, num_stages );
key = TOPK_HASH_KEY( I32, I32, num_stages, is_bitnoic_segment );
break;
case _PACK_SELECT_KEY(F32, U32):
case _PACK_SELECT_KEY(F16, U32):
@ -294,7 +304,7 @@ static vsi_status _query_kernel
case _PACK_SELECT_KEY(F16, U16):
case _PACK_SELECT_KEY(F32, U8):
case _PACK_SELECT_KEY(F16, U8):
key = TOPK_HASH_KEY( F32, U32, num_stages );
key = TOPK_HASH_KEY( F32, U32, num_stages, is_bitnoic_segment );
break;
case _PACK_SELECT_KEY(F32, I32):
case _PACK_SELECT_KEY(F16, I32):
@ -302,7 +312,7 @@ static vsi_status _query_kernel
case _PACK_SELECT_KEY(F16, I16):
case _PACK_SELECT_KEY(F32, I8):
case _PACK_SELECT_KEY(F16, I8):
key = TOPK_HASH_KEY( F32, I32, num_stages );
key = TOPK_HASH_KEY( F32, I32, num_stages, is_bitnoic_segment );
break;
default:
break;
@ -440,7 +450,12 @@ static vsi_nn_kernel_node_t _setup
int32_t top_k = vsi_nn_kernel_param_get_int32(params, "top_k");
int32_t num_stages = (int32_t)vsi_nn_max(ceil(log10(block_size / 2.0f) / log10(2.0f)), 0);
vsi_bool is_odd_even_sort = FALSE;
vsi_bool is_bitnoic_segment = FALSE;
size_t param_num = _TOPK_PARAM_NUM;
int32_t max_stages = 7 + (int32_t)log2(graph->ctx->config.subGroupSize >> 2);
vsi_nn_kernel_dtype_e type0 = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
vsi_nn_kernel_dtype_e type1 = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
float inputScale = vsi_nn_get_tensor_scale(inputs[0]);
float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
@ -471,9 +486,14 @@ static vsi_nn_kernel_node_t _setup
rs_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shape[0], 2 );
if (num_stages < 7)
is_bitnoic_segment = (num_stages >= 9) && (top_k <= 512 && max_stages > 9) &&
type0 == type1 && (type0 == U8 || type0 == I8 || type0 == I16 || type0 == U16 || type0 == I32 || type0 == U32);
num_stages = is_bitnoic_segment ? 9 : num_stages;
max_stages = is_bitnoic_segment ? max_stages : 7;
if (num_stages < max_stages || is_bitnoic_segment)
{
status = _query_kernel( kernel, inputs, outputs, num_stages );
status = _query_kernel( kernel, inputs, outputs, num_stages, is_bitnoic_segment );
rs_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shape[1], 2 );

View File

@ -52,15 +52,26 @@ typedef enum
#define _BILINEAR_GRID_SAMPLE_KERNEL_SOURCE(_input_type, _output_type) \
"bilinear_grid_sample_" #_input_type "_to_" #_output_type
#define _BILINEAR_GRID_SAMPLE_REFLECT_KERNEL_SOURCE(_input_type, _output_type) \
"bilinear_grid_sample_reflect_" #_input_type "_to_" #_output_type
// Add kernel hashtable here
#define BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
((IN1_DTYPE << 20) | (IN0_DTYPE << 8) | (OUT_DTYPE))
#define BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, REFLECT) \
((IN1_DTYPE << 24) | (IN0_DTYPE << 16) | (OUT_DTYPE << 8) | (REFLECT))
#define PACK_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
{ \
BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE), \
CVIVANTE_NAMESPACE("evis.bilinear_grid_sample_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \
_BILINEAR_GRID_SAMPLE_KERNEL_SOURCE(IN0_DTYPE, OUT_DTYPE) \
}
{ \
BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0), \
CVIVANTE_NAMESPACE("evis.bilinear_grid_sample_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \
_BILINEAR_GRID_SAMPLE_KERNEL_SOURCE(IN0_DTYPE, OUT_DTYPE) \
}
#define PACK_REFLECT_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
{ \
BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1), \
CVIVANTE_NAMESPACE("evis.bilinear_grid_sample_reflect_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \
_BILINEAR_GRID_SAMPLE_REFLECT_KERNEL_SOURCE(IN0_DTYPE, OUT_DTYPE) \
}
typedef struct
{
@ -83,6 +94,18 @@ static const _kernel_map_type _bilinear_grid_sample_kernel_map[] =
PACK_KERNEL_MAP(I16, I16, I16),
PACK_KERNEL_MAP(I8, I8, I8),
PACK_KERNEL_MAP(BF16, BF16, BF16),
PACK_REFLECT_KERNEL_MAP(F16, F32, F16),
PACK_REFLECT_KERNEL_MAP(F16, U8, F16),
PACK_REFLECT_KERNEL_MAP(F16, F16, F16),
PACK_REFLECT_KERNEL_MAP(F16, F32, U8),
PACK_REFLECT_KERNEL_MAP(F16, F16, U8),
PACK_REFLECT_KERNEL_MAP(F16, U8, U8),
PACK_REFLECT_KERNEL_MAP(U8, U8, U8),
PACK_REFLECT_KERNEL_MAP(U8, F16, U8),
PACK_REFLECT_KERNEL_MAP(U8, F32, U8),
PACK_REFLECT_KERNEL_MAP(I16, I16, I16),
PACK_REFLECT_KERNEL_MAP(I8, I8, I8),
PACK_REFLECT_KERNEL_MAP(BF16, BF16, BF16),
};
@ -96,18 +119,20 @@ static vx_param_description_t _bilinear_grid_sample_kernel_param_def[] =
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _BILINEAR_GRID_SAMPLE_PARAM_NUM _cnt_of_array( _bilinear_grid_sample_kernel_param_def )
#define _BILINEAR_GRID_SAMPLE_PARAM_NUM \
_cnt_of_array( _bilinear_grid_sample_kernel_param_def )
#define SCALAR_ALIGN_CORNERS (3)
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
static vsi_status _bilinear_grid_sample_initializer_base
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
size_t param_size,
vsi_bool is_reflect_mode
)
{
vsi_status status = VSI_FAILURE;
@ -135,6 +160,8 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
int32_t input1ZP = 0;
float output_scale = 1.0;
int32_t outputZP = 0;
float min_val_wh[4] = { 0 };
float span_wh[4] = { 0 };
VSI_UNREFERENCED(param_size);
@ -156,6 +183,7 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
(vsi_nn_kernel_scalar_t)param[SCALAR_ALIGN_CORNERS], &(align_corners));
CHECK_STATUS_FAIL_GOTO(status, final);
out_shape = output_attr->shape;
in0_shape = input_attr[0]->shape;
input0_dtype = input_attr[0]->dtype;
@ -193,6 +221,35 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "add_float_value", add_float_value);
status |= vsi_nn_kernel_gpu_add_param(node, "depth", &depth);
if (is_reflect_mode)
{
float low_w, low_h, high_w, high_h;
if (align_corners)
{
low_w = 0;
low_h = 0;
high_w = 2 * (float)(in0_width - 1);
high_h = 2 * (float)(in0_height - 1);
}
else
{
low_w = -1;
low_h = -1;
high_w = 2 * (float)in0_width - 1;
high_h = 2 * (float)in0_height - 1;
}
min_val_wh[0] = low_w / 2;
span_wh[0] = (high_w - low_w) / 2;
min_val_wh[1] = low_h / 2;
span_wh[1] = (high_h - low_h) / 2;
min_val_wh[2] = min_val_wh[0];
min_val_wh[3] = min_val_wh[1];
span_wh[2] = span_wh[0];
span_wh[3] = span_wh[1];
status |= vsi_nn_kernel_gpu_add_param(node, "span_wh", span_wh);
status |= vsi_nn_kernel_gpu_add_param(node, "min_val_wh", min_val_wh);
}
{
gpu_dp_inst_t uniFp16toFp32_part0_4x4 = {
{
@ -538,6 +595,28 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t* param,
size_t param_size
)
{
return _bilinear_grid_sample_initializer_base(
node, param, param_size, vx_false_e);
}
DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_reflect_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t* param,
size_t param_size
)
{
return _bilinear_grid_sample_initializer_base(
node, param, param_size, vx_true_e);
}
/*
* Query kernel
*/
@ -545,7 +624,8 @@ static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
vsi_nn_tensor_t * const * const outputs,
int32_t is_reflect_mode
)
{
vsi_status status = VSI_FAILURE;
@ -563,7 +643,16 @@ static vsi_status _query_kernel
in1_dtype = vsi_nn_kernel_map_dtype(inputs[1]->attr.dtype.vx_type);
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = BILINEAR_GRID_SAMPLE_HASH_KEY(in0_dtype, in1_dtype, out_dtype);
key = BILINEAR_GRID_SAMPLE_HASH_KEY(in0_dtype, in1_dtype, out_dtype, is_reflect_mode);
if (is_reflect_mode)
{
initializer = _bilinear_grid_sample_reflect_initializer;
}
else
{
initializer = _bilinear_grid_sample_initializer;
}
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
@ -605,13 +694,21 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_param_t node_params[_BILINEAR_GRID_SAMPLE_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
vsi_size_t final_shape[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1};
uint32_t final_in1_rank = 0;
vsi_size_t final_out_shape[VSI_NN_MAX_DIM_NUM] = { 1, 1, 1, 1 };
uint32_t final_in1_rank = 0, final_out_rank = 0;
vsi_nn_tensor_t* rs_tensors = NULL;
vsi_nn_tensor_t* rs_out_tensors = NULL;
vsi_nn_tensor_t* final_tensors[3] = {NULL};
vsi_nn_kernel_dtype_e in0_dtype;
uint32_t pad_val = 0;
int32_t align_corners =
vsi_nn_kernel_param_get_int32(params, "align_corners");
int32_t pad_mode = vsi_nn_kernel_param_get_int32(params, "padding_mode");
int32_t is_reflect_mode = 0;
vsi_size_t in_size_x = inputs[1]->attr.size[1];
vsi_size_t in_size_y = inputs[1]->attr.dim_num >= 3 ? inputs[1]->attr.size[2] : 1;
vsi_size_t new_size_x = in_size_x, new_size_y = in_size_y;
vsi_bool is_reshape_out = vx_false_e;
// Check if gpu can support the size
if (!vsi_nn_kernel_gpu_check_shape(inputs[0]->attr.size,
@ -624,12 +721,63 @@ static vsi_nn_kernel_node_t _setup
return NULL;
}
if (pad_mode == VSI_NN_PAD_MODE_REFLECT)
{
is_reflect_mode = 1;
}
final_tensors[0] = inputs[0];
is_reshape_out = vx_false_e;
if (inputs[1]->attr.dim_num >= 3) {
vsi_size_t shape_x[2];
vsi_size_t out_shape_x[2];
vsi_size_t out_rank_x;
shape_x[0] = in_size_x;
shape_x[1] = in_size_y;
vsi_nn_kernel_optimize_element_shape(shape_x, 2, out_shape_x, &out_rank_x);
if (out_rank_x == 2)
{
new_size_x = out_shape_x[0];
new_size_y = out_shape_x[1];
}
final_shape[0] = inputs[1]->attr.size[1] * inputs[1]->attr.size[0];
final_shape[1] = inputs[1]->attr.size[2];
if ((new_size_x == in_size_x) && (new_size_y == in_size_y))
{
is_reshape_out = vx_false_e;
}
else if ((new_size_x * 2) >= GPU_TENSOR_MAX_WIDTH)
{
is_reshape_out = vx_false_e;
}
else
{
is_reshape_out = vx_true_e;
}
if (is_reshape_out == vx_false_e)
{
new_size_x = in_size_x;
new_size_y = in_size_y;
if ((new_size_x < new_size_y) && ((new_size_y * 2) < GPU_TENSOR_MAX_WIDTH))
{
vsi_size_t tmp = new_size_x;
new_size_x = new_size_y;
new_size_y = tmp;
is_reshape_out = vx_true_e;
}
}
}
if (((new_size_x * 2) >= GPU_TENSOR_MAX_WIDTH) || (new_size_y >= GPU_TENSOR_MAX_WIDTH))
{
return NULL;
}
if (inputs[1]->attr.dim_num >= 3) {
final_shape[0] = new_size_x * inputs[1]->attr.size[0];
final_shape[1] = new_size_y;
final_shape[2] = 1;
final_shape[3] = inputs[1]->attr.dim_num > 3 ? inputs[1]->attr.size[3] : 1;
final_in1_rank =
@ -643,14 +791,32 @@ static vsi_nn_kernel_node_t _setup
} else {
final_tensors[1] = inputs[1];
}
final_tensors[2] = outputs[0];
if (is_reshape_out)
{
final_out_shape[0] = new_size_x;
final_out_shape[1] = new_size_y;
final_out_shape[2] = outputs[0]->attr.dim_num > 2 ? outputs[0]->attr.size[2] : 1;
final_out_shape[3] = outputs[0]->attr.dim_num > 3 ? outputs[0]->attr.size[3] : 1;
final_out_rank = outputs[0]->attr.dim_num;
if (!vsi_nn_kernel_gpu_check_shape(final_out_shape, final_out_rank)) {
return NULL;
}
rs_out_tensors = vsi_nn_reshape_tensor(graph, outputs[0], final_out_shape, final_out_rank);
final_tensors[2] = rs_out_tensors;
}
else
{
final_tensors[2] = outputs[0];
}
in0_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type);
if (U8 == in0_dtype) {
pad_val = inputs[0]->attr.dtype.zero_point;
}
status = _query_kernel( kernel, inputs, outputs );
status = _query_kernel( kernel, inputs, outputs, is_reflect_mode);
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
@ -662,14 +828,22 @@ static vsi_nn_kernel_node_t _setup
node_params[SCALAR_ALIGN_CORNERS] =
vsi_nn_kernel_scalar_create(graph, I32, &align_corners);
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _BILINEAR_GRID_SAMPLE_PARAM_NUM );
status = vsi_nn_kernel_node_pass_param(
node, node_params, _BILINEAR_GRID_SAMPLE_PARAM_NUM );
VSI_ASSERT(status == VSI_SUCCESS);
vsi_nn_kernel_scalar_release(&node_params[SCALAR_ALIGN_CORNERS]);
{
// Set default border mode.
vx_border_t border;
border.mode = VX_BORDER_CONSTANT;
border.constant_value.U32 = pad_val;
if (pad_mode == VSI_NN_PAD_MODE_CONSTANT)
{
border.mode = VX_BORDER_CONSTANT;
border.constant_value.U32 = pad_val;
}
else
{
border.mode = VX_BORDER_REPLICATE;
}
status = vxSetNodeAttribute(
(vx_node)node, VX_NODE_BORDER, &border, sizeof(border));
CHECK_STATUS(status);
@ -678,6 +852,7 @@ static vsi_nn_kernel_node_t _setup
}
vsi_safe_release_tensor(rs_tensors);
vsi_safe_release_tensor(rs_out_tensors);
return node;
} /* _setup() */

View File

@ -513,6 +513,8 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
}
if ( node )
{
// Set default border mode.
vx_border_t border;

View File

@ -34,6 +34,7 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util_prv.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
@ -1002,8 +1003,8 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer)
float twoLogE = 2 * logE;
uint32_t uint_min = 0xFBFFFFFF;
uint32_t uint_max = 0x7BFFFFFF;
float float_min = *(float *)&uint_min;
float float_max = *(float *)&uint_max;
float float_min = 0.0f;
float float_max = 0.0f;
float clip_Min_F[4] = {0};
float clip_Max_F[4] = {0};
uint32_t i = 0;
@ -1017,6 +1018,12 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer)
vsi_nn_kernel_tensor_attr_t* input_attr[9] = {NULL};
vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL};
fp32_bit_cast_t fp32_bit_cast;
fp32_bit_cast.data = uint_min;
float_min = fp32_bit_cast.val;
fp32_bit_cast.data = uint_max;
float_max = fp32_bit_cast.val;
status = vsi_nn_kernel_scalar_read_int32( (vsi_nn_kernel_scalar_t)param[param_size - 5], &_is_ln );
CHECK_STATUS_FAIL_GOTO(status, final );
status = vsi_nn_kernel_scalar_read_int32( (vsi_nn_kernel_scalar_t)param[param_size - 4], &_is_cifg );

View File

@ -539,6 +539,7 @@ static vsi_nn_kernel_node_t _setup
uint32_t pad_val = 0;
int32_t align_corners =
vsi_nn_kernel_param_get_int32(params, "align_corners");
int32_t pad_mode = vsi_nn_kernel_param_get_int32(params, "padding_mode");
// Check if gpu can support the size
if (!vsi_nn_kernel_gpu_check_shape(inputs[0]->attr.size,
@ -551,6 +552,11 @@ static vsi_nn_kernel_node_t _setup
return NULL;
}
if (pad_mode == VSI_NN_PAD_MODE_REFLECT)
{
return NULL;
}
final_tensors[0] = inputs[0];
if (inputs[1]->attr.dim_num >= 3) {
@ -596,8 +602,15 @@ static vsi_nn_kernel_node_t _setup
{
// Set default border mode.
vx_border_t border;
border.mode = VX_BORDER_CONSTANT;
border.constant_value.U32 = pad_val;
if (pad_mode == VSI_NN_PAD_MODE_CONSTANT)
{
border.mode = VX_BORDER_CONSTANT;
border.constant_value.U32 = pad_val;
}
else
{
border.mode = VX_BORDER_REPLICATE;
}
status = vxSetNodeAttribute(
(vx_node)node, VX_NODE_BORDER, &border, sizeof(border));
CHECK_STATUS(status);

View File

@ -754,7 +754,7 @@ static vsi_nn_kernel_node_t _setup
if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensor->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
goto final;
}
if ( width == (int32_t)inputs[0]->attr.size[0] && height == (int32_t)inputs[0]->attr.size[1] &&

View File

@ -915,7 +915,7 @@ static vsi_nn_kernel_node_t _setup
if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensor->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
goto final;
}
if ( width == (int32_t)inputs[0]->attr.size[0] && height == (int32_t)inputs[0]->attr.size[1] &&

View File

@ -290,13 +290,14 @@ static vsi_nn_tensor_t* _create_scale_tensor
vsi_nn_tensor_t* scale = NULL;
vsi_size_t i = 0;
float *scale_data_ptr = NULL;
int *index_data_ptr = NULL;
int32_t *index_data_ptr = NULL;
float scale_value = 0;
vsi_ssize_t data = 0;
int idx = 0;
int32_t idx = 0;
float delta_v = 0;
float cubic_coeff_a = -0.5f;
vsi_size_t item_count = 4 * output_size;
scale_data_ptr = (float *)malloc(item_count * sizeof(float));
if (scale_data_ptr == NULL)
{
@ -316,7 +317,7 @@ static vsi_nn_tensor_t* _create_scale_tensor
scale_value = ((float)i + half_pixel_value) * scale_factor - half_pixel_value;
data = (vsi_ssize_t)scale_value;
delta_v = scale_value - (float)data;
idx = (int)data - 1;
idx = (int32_t)data - 1;
index_data_ptr[i] = idx;
scale_data_ptr[i * 4 + 0] = cubic_coeff_a * (((delta_v - 4) * (delta_v + 1) + 8) * (delta_v + 1) - 4);
@ -331,11 +332,6 @@ static vsi_nn_tensor_t* _create_scale_tensor
attr.vtl = FALSE;
scale = vsi_nn_CreateTensorFromData(graph, (uint8_t *)scale_data_ptr, &attr);
if (scale_data_ptr)
{
free (scale_data_ptr);
scale_data_ptr = NULL;
}
attr.size[0] = output_size;
attr.dim_num = 1;
@ -343,13 +339,11 @@ static vsi_nn_tensor_t* _create_scale_tensor
attr.vtl = FALSE;
*index = vsi_nn_CreateTensorFromData(graph, (uint8_t *)index_data_ptr, &attr);
if (index_data_ptr)
{
free (index_data_ptr);
index_data_ptr = NULL;
}
OnError:
vsi_nn_safe_free(scale_data_ptr);
vsi_nn_safe_free(index_data_ptr);
return scale;
}

View File

@ -1218,6 +1218,11 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector
status = backend->select( graph, inputs, input_num, outputs, output_num,
params, &selector );
VSI_ASSERT( status == VSI_SUCCESS );
if ( status != VSI_SUCCESS ) {
VSILOGW("Failed to select kernel \"%s\"", kernel_name);
return NULL;
}
}
else
{

View File

@ -171,7 +171,9 @@ vsi_status vsi_nn_kernel_copy_tensor_patch
vsi_nn_kernel_tensor_attr_get_stride( attr, stride );
memset(start, 0, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM);
for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++)
memset(end, 0, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM);
for (i = 0; i < (uint32_t)attr->shape->size; i++)
{
end[i] = attr->shape->data[i];
if ( attr->dtype != I4 && attr->dtype != U4 )
@ -490,7 +492,7 @@ vsi_status vsi_nn_kernel_scalar_get_dtype
( vsi_nn_kernel_scalar_t scalar, DTYPE * ptr ) \
{ \
vsi_status status; \
vsi_nn_kernel_dtype_e dtype; \
vsi_nn_kernel_dtype_e dtype = INVALID_DTYPE; \
if( !ptr ) \
{ \
VSILOGE("Pointer to store scalar is null"); \

View File

@ -0,0 +1,94 @@
/****************************************************************************
*
* Copyright (c) 2021 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#define REGISTER_RMS_NORM_OPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
); \
REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
)
REGISTER_RMS_NORM_OPENVX_KERNEL(rms_norm)
{
vx_node node = NULL;
#if (VX_RMS_NORM_VX_SUPPORT)
float eps = vsi_nn_kernel_param_get_float32( params, "eps" );
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
vx_tensor inputs_tensor[2] = {NULL};
vx_tensor output_tensor = NULL;
VSI_UNREFERENCED(kernel);
VSI_UNREFERENCED(output_num);
inputs_tensor[0] = inputs[0]->t;
inputs_tensor[1] = inputs[1]->t;
output_tensor = outputs[0]->t;
node = vxRMSNormalizationLayer(
graph->g,
eps,
axis,
inputs_tensor,
(uint32_t)input_num,
output_tensor
);
#else
VSI_UNREFERENCED(output_num);
VSI_UNREFERENCED(graph);
VSI_UNREFERENCED(inputs);
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(outputs);
VSI_UNREFERENCED(output_num);
VSI_UNREFERENCED(params);
VSI_UNREFERENCED(kernel);
#endif
return (vsi_nn_kernel_node_t)node;
} /* rms_norm() */

View File

@ -0,0 +1,169 @@
__kernel void bilinear_grid_sample_reflect_F32_F32toF32(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
float half_input0_w,
float half_input0_h,
float add_float_value_w,
float add_float_value_h,
int depth,
float min_val_w,
float span_w,
float min_val_h,
float span_h
)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
int2 coord_in1 = (int2)(get_global_id(0) * 2, get_global_id(1));
int2 coord_add = (int2)(-1, 1);
float fx = read_imagef(input1, coord_in1).x;
coord_in1.x = coord_in1.x + 1;
float fy = read_imagef(input1, coord_in1).x;
fx = fx * half_input0_w + add_float_value_w;
fy = fy * half_input0_h + add_float_value_h;
if (span_w > 0)
{
fx = fabs(fx - min_val_w);
int flips_x = (int)(fx / span_w);
float extra_x = fx - flips_x * span_w;
fx = (flips_x & 0x01) ? min_val_w + (span_w - extra_x) : min_val_w + extra_x ;
}
else
{
fx = 0;
}
if (span_h > 0)
{
fy = fabs(fy - min_val_h);
int flips_y = (int)(fy / span_h);
float extra_y = fy - flips_y * span_h;
fy = (flips_y & 0x01) ? min_val_h + (span_h - extra_y) : min_val_h + extra_y ;
}
else
{
fy = 0;
}
float x_f = floor(fx);
float y_f = floor(fy);
float x_lerp = fx - x_f;
float y_lerp = fy - y_f;
int x_index = convert_int(x_f);
int y_index = convert_int(y_f);
int4 coord_in = (int4)(x_index, y_index, 0, 0);
float4 top_l, top_r, bottom_l, bottom_r, top, bottom, dst;
while (coord_in.z < depth){
top_l = read_imagef(input0, coord_in);
coord_in.y++;
bottom_l = read_imagef(input0, coord_in);
coord_in.x++;
bottom_r = read_imagef(input0, coord_in);
coord_in.y--;
top_r = read_imagef(input0, coord_in);
top_r = top_r - top_l;
top = top_l + x_lerp * top_r;
bottom_r = bottom_r - bottom_l;
bottom = bottom_l + x_lerp * bottom_r;
bottom = bottom - top;
dst = top + y_lerp * bottom;
write_imagef(output, coord_out, dst);
coord_in.xz = coord_in.xz + coord_add;
coord_out.z++;
}
}
__kernel void bilinear_grid_sample_reflect_U8_U8toU8(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
float half_input0_w,
float half_input0_h,
float add_float_value_w,
float add_float_value_h,
int depth,
float in0_scale,
float in0_tail,
float in1_scale,
float in1_tail,
float out_scale,
float out_tail,
float min_val_w,
float span_w,
float min_val_h,
float span_h
)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
int2 coord_in1 = (int2)(get_global_id(0) * 2, get_global_id(1));
int2 coord_add = (int2)(-1, 1);
float fx = convert_float4(read_imageui(input1, coord_in1)).x * in1_scale + in1_tail;
coord_in1.x = coord_in1.x + 1;
float fy = convert_float4(read_imageui(input1, coord_in1)).x * in1_scale + in1_tail;
fx = fx * half_input0_w + add_float_value_w;
fy = fy * half_input0_h + add_float_value_h;
if (span_w > 0)
{
fx = fabs(fx - min_val_w);
int flips_x = (int)(fx / span_w);
float extra_x = fx - flips_x * span_w;
fx = (flips_x & 0x01) ? min_val_w + (span_w - extra_x) : min_val_w + extra_x ;
}
else
{
fx = 0;
}
if (span_h > 0)
{
fy = fabs(fy - min_val_h);
int flips_y = (int)(fy / span_h);
float extra_y = fy - flips_y * span_h;
fy = (flips_y & 0x01) ? min_val_h + (span_h - extra_y) : min_val_h + extra_y ;
}
else
{
fy = 0;
}
float x_f = floor(fx);
float y_f = floor(fy);
float x_lerp = fx - x_f;
float y_lerp = fy - y_f;
int x_index = convert_int(x_f);
int y_index = convert_int(y_f);
int4 coord_in = (int4)(x_index, y_index, 0, 0);
float4 top_l, top_r, bottom_l, bottom_r, top, bottom;
uint4 dst;
while (coord_in.z < depth){
top_l = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail;
coord_in.y++;
bottom_l = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail;
coord_in.x++;
bottom_r = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail;
coord_in.y--;
top_r = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail;
top_r = top_r - top_l;
top = top_l + x_lerp * top_r;
bottom_r = bottom_r - bottom_l;
bottom = bottom_l + x_lerp * bottom_r;
bottom = bottom - top;
top = top + y_lerp * bottom;
dst = convert_uint4_rte(top * out_scale + out_tail);
write_imageui(output, coord_out, dst);
coord_in.xz = coord_in.xz + coord_add;
coord_out.z++;
}
}

View File

@ -1,4 +1,4 @@
#define VSI_FLOAT32_MIN (1.175494351e-38F)
#define VSI_FLOAT32_MIN (-3.40E+38)
#define MAXPOOL_QINT(in_name, out_name, src_type, dst_type, max_val, read_func, write_func, conv_func) \
__kernel void maxpool_##in_name##to##out_name( \
@ -45,7 +45,7 @@ __kernel void maxpool_##in_name##to##out_name( \
{ \
src0 = read_func(input, coord_in); \
coord_in.x += dilation_x; \
maxVal = max(src0, maxVal); \
maxVal.x = src0.x > maxVal.x ? src0.x : maxVal.x; \
} \
} \
\
@ -101,7 +101,7 @@ __kernel void maxpool_F32toF32(
{
src0 = read_imagef(input, coord_in);
coord_in.x += dilation_x;
maxVal = max(src0, maxVal);
maxVal.x = src0.x > maxVal.x ? src0.x : maxVal.x;
}
}
@ -152,7 +152,7 @@ __kernel void maxpool_U32toF32(
{
src0 = read_imageui(input, coord_in);
coord_in.x += dilation_x;
maxVal = max(src0, maxVal);
maxVal.x = src0.x > maxVal.x ? src0.x : maxVal.x;
}
}
@ -206,7 +206,7 @@ __kernel void maxpool_F32toU32(
{
src0 = read_imagef(input, coord_in);
coord_in.x += dilation_x;
maxVal = max(src0, maxVal);
maxVal.x = src0.x > maxVal.x ? src0.x : maxVal.x;
}
}
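
The changed define fixes the accumulator seed: 1.175494351e-38F is FLT_MIN, the smallest positive normalized float, so seeding a running maximum with it would discard any pooling window whose values are all negative; the most negative finite float (about -3.40e+38, i.e. -FLT_MAX) is the correct identity for a max reduction. A tiny standalone check (illustration only):

#include <float.h>
#include <stdio.h>

int main(void)
{
    printf("FLT_MIN  = %e\n", FLT_MIN);   /* 1.175494e-38: tiny but positive  */
    printf("-FLT_MAX = %e\n", -FLT_MAX);  /* -3.402823e+38: proper max seed   */
    return 0;
}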

View File

@ -51,7 +51,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
float left_elem = local_data[left_id]; \
float right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem) ^ signo) \
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \
@ -139,7 +139,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
uint left_elem = local_data[left_id]; \
uint right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem) ^ signo) \
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \
@ -227,7 +227,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
int left_elem = local_data[left_id]; \
int right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem) ^ signo) \
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \
@ -315,7 +315,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
float left_elem = local_data[left_id]; \
float right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem) ^ signo) \
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \
@ -403,7 +403,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
float left_elem = local_data[left_id]; \
float right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem) ^ signo) \
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \

View File

@ -0,0 +1,368 @@
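/* Bitonic top-k helpers: BITONIC_STEP fully sorts 2*BLOCK_SIZE key/index pairs
 * held in local memory (largest keys first), BITONIC_STEP_ASCEND sorts a fresh
 * candidate batch in ascending order, and BITONIC_MERGE runs the final merge
 * stage that folds the best candidates back into the running result. Ties on
 * the key are broken by the element index so equal keys keep a deterministic
 * order. */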
#define BITONIC_STEP(dtype) \
void bitonic_step_##dtype(uint num_stages, int lx, \
__local dtype *local_data, __local int *local_indices) \
{ \
for (uint stage = 0; stage < num_stages + 1; ++stage) \
{ \
uint signo = (lx >> stage) & 1; \
\
for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \
{ \
uint postShift = (stage - passOfStage); \
uint pairDistance = 1 << postShift; \
\
uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \
uint right_id = left_id + pairDistance; \
\
int left_idx = local_indices[left_id]; \
int right_idx = local_indices[right_id]; \
\
dtype left_elem = local_data[left_id]; \
dtype right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \
\
local_indices[left_id] = right_idx; \
local_indices[right_id] = left_idx; \
} \
\
barrier(CLK_LOCAL_MEM_FENCE); \
} \
} \
}
BITONIC_STEP(int)
BITONIC_STEP(uint)
#define BITONIC_STEP_ASCEND(dtype) \
void bitonic_step_ascend_##dtype(uint num_stages, int lx, \
__local dtype *p_share_k, __local int *p_share_v) \
{ \
for (uint stage = 0; stage < num_stages + 1; ++stage) \
{ \
uint signo = (lx >> stage) & 1; \
\
for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \
{ \
uint postShift = (stage - passOfStage); \
uint pairDistance = 1 << postShift; \
\
uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \
uint right_id = left_id + pairDistance; \
\
int left_idx = p_share_v[left_id]; \
int right_idx = p_share_v[right_id]; \
\
dtype left_elem = p_share_k[left_id]; \
dtype right_elem = p_share_k[right_id]; \
\
if ((left_elem > right_elem || (left_elem == right_elem && left_idx > right_idx)) ^ signo) \
{ \
p_share_k[left_id] = right_elem; \
p_share_k[right_id] = left_elem; \
\
p_share_v[left_id] = right_idx; \
p_share_v[right_id] = left_idx; \
} \
\
barrier(CLK_LOCAL_MEM_FENCE); \
} \
} \
}
BITONIC_STEP_ASCEND(int)
BITONIC_STEP_ASCEND(uint)
#define BITONIC_MERGE(dtype) \
void bitonic_merge_##dtype(uint num_stages, int lx, \
__local dtype *local_data, __local int *local_indices) \
{ \
uint stage = num_stages; \
uint signo = (lx >> stage) & 1; \
\
for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \
{ \
uint postShift = (stage - passOfStage); \
uint pairDistance = 1 << postShift; \
\
uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \
uint right_id = left_id + pairDistance; \
\
int left_idx = local_indices[left_id]; \
int right_idx = local_indices[right_id]; \
\
dtype left_elem = local_data[left_id]; \
dtype right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \
\
local_indices[left_id] = right_idx; \
local_indices[right_id] = left_idx; \
} \
\
barrier(CLK_LOCAL_MEM_FENCE); \
} \
}
BITONIC_MERGE(int)
BITONIC_MERGE(uint)
#define BLOCK_SIZE (512)
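/* One work-group of BLOCK_SIZE threads scans an entire row of `width` elements
 * and keeps the BLOCK_SIZE largest values together with their indices. */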
__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_I32toI32_I32
(
__read_only image2d_t input,
__write_only image2d_t output,
__write_only image2d_t indices,
float input_scale,
float input_tail,
float output_scale,
float output_tail,
int _num_stages,
int width
)
{
uint lx = get_local_id(0);
const int init_k = -2147483647;
const int init_v = -2147483647;
const int num_stages = 9;
const int threads_per_block = BLOCK_SIZE;
const int index_minus_1 = threads_per_block * 2 - 1;
uint offset = 0;
uint lx1 = lx + threads_per_block;
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
__local int local_data[1536];
__local int local_indices[1536];
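/* local_data[0..1023] receives the first 1024 elements; after the initial sort
 * [0..511] holds the running top-512 (descending) and local_data[511] is the
 * current admission threshold, while [512..1535] is reused through p_share_k /
 * p_share_v as the 1024-entry staging buffer for each subsequent batch. */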
int left = read_imagei(input, coord.xy).x;
coord.z += threads_per_block;
int right = read_imagei(input, coord.zy).x;
local_data[lx] = left;
local_indices[lx] = coord.x;
local_data[lx1] = right;
local_indices[lx1] = coord.z;
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_step_int(num_stages, lx, local_data, local_indices);
int min_data = local_data[511];
__local int *p_share_k = local_data + threads_per_block;
__local int *p_share_v = local_indices + threads_per_block;
int limit = (width >> 10) << 10;
p_share_k[lx] = init_k;
p_share_v[lx] = init_v;
p_share_k[lx1] = init_k;
p_share_v[lx1] = init_v;
barrier(CLK_LOCAL_MEM_FENCE);
for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)
{
int2 data;
coord.z = coord.x + threads_per_block;
data.x = read_imagei(input, coord.xy).x;
data.y = read_imagei(input, coord.zy).x;
p_share_k[lx] = data.x;
p_share_v[lx] = coord.x;
p_share_k[lx1] = data.y;
p_share_v[lx1] = coord.z;
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);
if (p_share_k[index_minus_1] < min_data)
{
continue;
}
p_share_k[lx] = p_share_k[lx1];
p_share_v[lx] = p_share_v[lx1];
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_merge_int(num_stages, lx, local_data, local_indices);
min_data = local_data[511];
p_share_k[lx] = init_k;
p_share_v[lx] = init_v;
p_share_k[lx1] = init_k;
p_share_v[lx1] = init_v;
}
if (width > limit)
{
if (coord.x < width)
{
int2 data;
data.x = read_imagei(input, coord.xy).x;
coord.z = coord.x + threads_per_block;
data.y = read_imagei(input, coord.zy).x;
p_share_k[lx] = data.x;
p_share_v[lx] = coord.x;
p_share_k[lx1] = coord.z < width ? data.y : init_k;
p_share_v[lx1] = coord.z < width ? coord.z : init_v;
}
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);
if (p_share_k[index_minus_1] >= min_data)
{
p_share_k[lx] = p_share_k[lx1];
p_share_v[lx] = p_share_v[lx1];
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_merge_int(num_stages, lx, local_data, local_indices);
}
}
int4 dst;
dst.x = local_data[lx];
coord.x = lx;
write_imagei(output, coord.xy, dst.xxxx);
int4 index;
index.x = local_indices[lx];
write_imagei(indices, coord.xy, index.xxxx);
}
__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_U32toU32_I32
(
__read_only image2d_t input,
__write_only image2d_t output,
__write_only image2d_t indices,
float input_scale,
float input_tail,
float output_scale,
float output_tail,
int _num_stages,
int width
)
{
uint lx = get_local_id(0);
const uint init_k = 0;
const int init_v = -2147483647;
const int num_stages = 9;
const int threads_per_block = BLOCK_SIZE;
const int index_minus_1 = threads_per_block * 2 - 1;
uint offset = 0;
uint lx1 = lx + threads_per_block;
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
__local uint local_data[1536];
__local int local_indices[1536];
uint left = read_imageui(input, coord.xy).x;
coord.z += threads_per_block;
uint right = read_imageui(input, coord.zy).x;
local_data[lx] = left;
local_indices[lx] = coord.x;
local_data[lx1] = right;
local_indices[lx1] = coord.z;
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_step_uint(num_stages, lx, local_data, local_indices);
uint min_data = local_data[511];
__local uint *p_share_k = local_data + threads_per_block;
__local int *p_share_v = local_indices + threads_per_block;
int limit = (width >> 10) << 10;
p_share_k[lx] = init_k;
p_share_v[lx] = init_v;
p_share_k[lx1] = init_k;
p_share_v[lx1] = init_v;
barrier(CLK_LOCAL_MEM_FENCE);
for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)
{
uint2 data;
coord.z = coord.x + threads_per_block;
data.x = read_imageui(input, coord.xy).x;
data.y = read_imageui(input, coord.zy).x;
p_share_k[lx] = data.x;
p_share_v[lx] = coord.x;
p_share_k[lx1] = data.y;
p_share_v[lx1] = coord.z;
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);
if (p_share_k[index_minus_1] < min_data)
{
continue;
}
p_share_k[lx] = p_share_k[lx1];
p_share_v[lx] = p_share_v[lx1];
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_merge_uint(num_stages, lx, local_data, local_indices);
min_data = local_data[511];
p_share_k[lx] = init_k;
p_share_v[lx] = init_v;
p_share_k[lx1] = init_k;
p_share_v[lx1] = init_v;
}
if (width > limit)
{
if (coord.x < width)
{
uint2 data;
data.x = read_imageui(input, coord.xy).x;
coord.z = coord.x + threads_per_block;
data.y = read_imageui(input, coord.zy).x;
p_share_k[lx] = data.x;
p_share_v[lx] = coord.x;
p_share_k[lx1] = coord.z < width ? data.y : init_k;
p_share_v[lx1] = coord.z < width ? coord.z : init_v;
}
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);
if (p_share_k[index_minus_1] >= min_data)
{
p_share_k[lx] = p_share_k[lx1];
p_share_v[lx] = p_share_v[lx1];
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_merge_uint(num_stages, lx, local_data, local_indices);
}
}
uint4 dst;
dst.x = local_data[lx];
coord.x = lx;
write_imageui(output, coord.xy, dst.xxxx);
int4 index;
index.x = local_indices[lx];
write_imagei(indices, coord.xy, index.xxxx);
}

View File

@ -28,12 +28,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
}
__local int sorted[1];
int width_minus_one = width - 1;
int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X;
num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1);
int x_start = lid * num_pixels_per_thread;
int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one);
sorted[0] = 0;
@ -44,20 +38,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
*sorted = 0;
}
int swapped = 0;
barrier(CLK_GLOBAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
// odd-even
coord.x = x_start;
coord.z = x_start + 1;
for (; coord.x < x_end; )
coord.x = lid * 2;
coord.z = lid * 2 + 1;
for (; coord.z < width; )
{
float4 left = read_imagef(input_t, coord.xy);
float4 right = read_imagef(input_t, coord.zy);
int4 l_index = read_imagei(indices_t, coord.xy);
int4 r_index = read_imagei(indices_t, coord.zy);
if (left.x < right.x)
if ( (left.x < right.x) ||
(left.x == right.x && l_index.x < r_index.x) )
{
int4 l_index = read_imagei(indices_t, coord.xy);
int4 r_index = read_imagei(indices_t, coord.zy);
swapped = 1;
write_imagef(input_t, coord.xy, right);
@ -67,21 +62,23 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
write_imagei(indices_t, coord.zy, l_index);
}
coord.xz = coord.xz + 2;
coord.xz += 2 * LOCAL_SIZE_X;
}
barrier(CLK_GLOBAL_MEM_FENCE);
// even-odd
coord.x = x_start + 1;
coord.z = x_start + 2;
for (; coord.x < x_end; )
coord.x = lid * 2 + 1;
coord.z = lid * 2 + 2;
for (; coord.z < width; )
{
float4 left = read_imagef(input_t, coord.xy);
float4 right = read_imagef(input_t, coord.zy);
int4 l_index = read_imagei(indices_t, coord.xy);
int4 r_index = read_imagei(indices_t, coord.zy);
if (left.x < right.x)
if ( (left.x < right.x) ||
(left.x == right.x && l_index.x < r_index.x) )
{
int4 l_index = read_imagei(indices_t, coord.xy);
int4 r_index = read_imagei(indices_t, coord.zy);
swapped = 1;
write_imagef(input_t, coord.xy, right);
@ -91,11 +88,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
write_imagei(indices_t, coord.zy, l_index);
}
coord.xz = coord.xz + 2;
coord.xz += 2 * LOCAL_SIZE_X;
}
atomic_add(sorted, swapped);
barrier(CLK_GLOBAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
if (*sorted == 0)
break;
@ -141,13 +138,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
}
__local int sorted[1];
int width_minus_one = width - 1;
int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X;
num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1);
int x_start = lid * num_pixels_per_thread;
int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one);
sorted[0] = 0;
while (1)
@ -157,20 +147,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
*sorted = 0;
}
int swapped = 0;
barrier(CLK_GLOBAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
// odd-even
coord.x = x_start;
coord.z = x_start + 1;
for (; coord.x < x_end; )
coord.x = lid * 2;
coord.z = lid * 2 + 1;
for (; coord.z < width; )
{
uint4 left = read_imageui(input_t, coord.xy);
uint4 right = read_imageui(input_t, coord.zy);
int4 l_index = read_imagei(indices_t, coord.xy);
int4 r_index = read_imagei(indices_t, coord.zy);
if (left.x < right.x)
if ( (left.x < right.x) ||
(left.x == right.x && l_index.x < r_index.x) )
{
int4 l_index = read_imagei(indices_t, coord.xy);
int4 r_index = read_imagei(indices_t, coord.zy);
swapped = 1;
write_imageui(input_t, coord.xy, right);
@ -180,21 +171,23 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
write_imagei(indices_t, coord.zy, l_index);
}
coord.xz = coord.xz + 2;
coord.xz += 2 * LOCAL_SIZE_X;
}
barrier(CLK_GLOBAL_MEM_FENCE);
// even-odd
coord.x = x_start + 1;
coord.z = x_start + 2;
for (; coord.x < x_end; )
coord.x = lid * 2 + 1;
coord.z = lid * 2 + 2;
for (; coord.z < width; )
{
uint4 left = read_imageui(input_t, coord.xy);
uint4 right = read_imageui(input_t, coord.zy);
int4 l_index = read_imagei(indices_t, coord.xy);
int4 r_index = read_imagei(indices_t, coord.zy);
if (left.x < right.x)
if ( (left.x < right.x) ||
(left.x == right.x && l_index.x < r_index.x) )
{
int4 l_index = read_imagei(indices_t, coord.xy);
int4 r_index = read_imagei(indices_t, coord.zy);
swapped = 1;
write_imageui(input_t, coord.xy, right);
@ -204,11 +197,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
write_imagei(indices_t, coord.zy, l_index);
}
coord.xz = coord.xz + 2;
coord.xz += 2 * LOCAL_SIZE_X;
}
atomic_add(sorted, swapped);
barrier(CLK_GLOBAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
if (*sorted == 0)
break;
@ -254,13 +247,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
}
__local int sorted[1];
int width_minus_one = width - 1;
int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X;
num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1);
int x_start = lid * num_pixels_per_thread;
int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one);
sorted[0] = 0;
while (1)
@ -270,20 +256,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
*sorted = 0;
}
int swapped = 0;
barrier(CLK_GLOBAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
// odd-even
coord.x = x_start;
coord.z = x_start + 1;
for (; coord.x < x_end; )
coord.x = lid * 2;
coord.z = lid * 2 + 1;
for (; coord.z < width; )
{
int4 left = read_imagei(input_t, coord.xy);
int4 right = read_imagei(input_t, coord.zy);
int4 l_index = read_imagei(indices_t, coord.xy);
int4 r_index = read_imagei(indices_t, coord.zy);
if (left.x < right.x)
if ( (left.x < right.x) ||
(left.x == right.x && l_index.x < r_index.x) )
{
int4 l_index = read_imagei(indices_t, coord.xy);
int4 r_index = read_imagei(indices_t, coord.zy);
swapped = 1;
write_imagei(input_t, coord.xy, right);
@ -293,21 +280,23 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
write_imagei(indices_t, coord.zy, l_index);
}
coord.xz = coord.xz + 2;
coord.xz += 2 * LOCAL_SIZE_X;
}
barrier(CLK_GLOBAL_MEM_FENCE);
// even-odd
coord.x = x_start + 1;
coord.z = x_start + 2;
for (; coord.x < x_end; )
coord.x = lid * 2 + 1;
coord.z = lid * 2 + 2;
for (; coord.z < width; )
{
int4 left = read_imagei(input_t, coord.xy);
int4 right = read_imagei(input_t, coord.zy);
int4 l_index = read_imagei(indices_t, coord.xy);
int4 r_index = read_imagei(indices_t, coord.zy);
if (left.x < right.x)
if ( (left.x < right.x) ||
(left.x == right.x && l_index.x < r_index.x) )
{
int4 l_index = read_imagei(indices_t, coord.xy);
int4 r_index = read_imagei(indices_t, coord.zy);
swapped = 1;
write_imagei(input_t, coord.xy, right);
@ -317,11 +306,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
write_imagei(indices_t, coord.zy, l_index);
}
coord.xz = coord.xz + 2;
coord.xz += 2 * LOCAL_SIZE_X;
}
atomic_add(sorted, swapped);
barrier(CLK_GLOBAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
if (*sorted == 0)
break;

View File

@ -28,12 +28,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
}
__local int sorted[1];
int width_minus_one = width - 1;
int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X;
num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1);
int x_start = lid * num_pixels_per_thread;
int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one);
sorted[0] = 0;
@ -44,20 +38,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
*sorted = 0;
}
int swapped = 0;
barrier(CLK_GLOBAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
// odd-even
coord.x = x_start;
coord.z = x_start + 1;
for (; coord.x < x_end; )
coord.x = lid * 2;
coord.z = lid * 2 + 1;
for (; coord.z < width; )
{
float4 left = read_imagef(input_t, coord.xy);
float4 right = read_imagef(input_t, coord.zy);
int4 l_index = read_imagei(indices_t, coord.xy);
int4 r_index = read_imagei(indices_t, coord.zy);
if (left.x < right.x)
if ( (left.x < right.x) ||
(left.x == right.x && l_index.x < r_index.x) )
{
int4 l_index = read_imagei(indices_t, coord.xy);
int4 r_index = read_imagei(indices_t, coord.zy);
swapped = 1;
write_imagef(input_t, coord.xy, right);
@ -67,21 +62,23 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
write_imagei(indices_t, coord.zy, l_index);
}
coord.xz = coord.xz + 2;
coord.xz += 2 * LOCAL_SIZE_X;
}
barrier(CLK_GLOBAL_MEM_FENCE);
// even-odd
coord.x = x_start + 1;
coord.z = x_start + 2;
for (; coord.x < x_end; )
coord.x = lid * 2 + 1;
coord.z = lid * 2 + 2;
for (; coord.z < width; )
{
float4 left = read_imagef(input_t, coord.xy);
float4 right = read_imagef(input_t, coord.zy);
int4 l_index = read_imagei(indices_t, coord.xy);
int4 r_index = read_imagei(indices_t, coord.zy);
if (left.x < right.x)
if ( (left.x < right.x) ||
(left.x == right.x && l_index.x < r_index.x) )
{
int4 l_index = read_imagei(indices_t, coord.xy);
int4 r_index = read_imagei(indices_t, coord.zy);
swapped = 1;
write_imagef(input_t, coord.xy, right);
@ -91,11 +88,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
write_imagei(indices_t, coord.zy, l_index);
}
coord.xz = coord.xz + 2;
coord.xz += 2 * LOCAL_SIZE_X;
}
atomic_add(sorted, swapped);
barrier(CLK_GLOBAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
if (*sorted == 0)
break;
@ -143,13 +140,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
}
__local int sorted[1];
int width_minus_one = width - 1;
int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X;
num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1);
int x_start = lid * num_pixels_per_thread;
int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one);
sorted[0] = 0;
while (1)
@ -159,20 +149,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
*sorted = 0;
}
int swapped = 0;
barrier(CLK_GLOBAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
// odd-even
coord.x = x_start;
coord.z = x_start + 1;
for (; coord.x < x_end; )
coord.x = lid * 2;
coord.z = lid * 2 + 1;
for (; coord.z < width; )
{
float4 left = read_imagef(input_t, coord.xy);
float4 right = read_imagef(input_t, coord.zy);
int4 l_index = read_imagei(indices_t, coord.xy);
int4 r_index = read_imagei(indices_t, coord.zy);
if (left.x < right.x)
if ( (left.x < right.x) ||
(left.x == right.x && l_index.x < r_index.x) )
{
int4 l_index = read_imagei(indices_t, coord.xy);
int4 r_index = read_imagei(indices_t, coord.zy);
swapped = 1;
write_imagef(input_t, coord.xy, right);
@ -182,18 +173,22 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
write_imagei(indices_t, coord.zy, l_index);
}
coord.xz = coord.xz + 2;
coord.xz += 2 * LOCAL_SIZE_X;
}
barrier(CLK_GLOBAL_MEM_FENCE);
// even-odd
coord.x = x_start + 1;
coord.z = x_start + 2;
for (; coord.x < x_end; )
coord.x = lid * 2 + 1;
coord.z = lid * 2 + 2;
for (; coord.z < width; )
{
float4 left = read_imagef(input_t, coord.xy);
float4 right = read_imagef(input_t, coord.zy);
int4 l_index = read_imagei(indices_t, coord.xy);
int4 r_index = read_imagei(indices_t, coord.zy);
if (left.x < right.x)
if ( (left.x < right.x) ||
(left.x == right.x && l_index.x < r_index.x) )
{
int4 l_index = read_imagei(indices_t, coord.xy);
int4 r_index = read_imagei(indices_t, coord.zy);
@ -206,11 +201,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
write_imagei(indices_t, coord.zy, l_index);
}
coord.xz = coord.xz + 2;
coord.xz += 2 * LOCAL_SIZE_X;
}
atomic_add(sorted, swapped);
barrier(CLK_GLOBAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
if (*sorted == 0)
break;

View File

@ -0,0 +1,171 @@
#include "cl_viv_vx_ext.h"
_viv_uniform float2 half_input0_wh;
_viv_uniform float2 add_float_value;
_viv_uniform int depth;
_viv_uniform VXC_512Bits uniConvBF16toF32_even_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_odd_2x8;
_viv_uniform VXC_512Bits uniBF16toFp32_part0_2x8;
_viv_uniform VXC_512Bits uniBF16toFp32_part1_2x8;
_viv_uniform float4 span_wh;
_viv_uniform float4 min_val_wh;
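/* Reflect padding: grid coordinates are folded back into the valid range
 * [min_val_wh, min_val_wh + span_wh]. The number of whole spans crossed
 * (flips) picks the direction: an even count keeps the residual offset, an
 * odd count mirrors it back from the far edge. */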
#define GRID_SAMPLE_BF16_PROCESS() \
fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \
fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \
fxy0 = fabs(fxy0 - min_val_wh); \
fxy1 = fabs(fxy1 - min_val_wh); \
float4 flips_xy0 = floor(fxy0 / span_wh); \
float4 flips_xy1 = floor(fxy1 / span_wh); \
float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \
float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \
int4 flips_int_xy0 = convert_int4(flips_xy0); \
int4 flips_int_xy1 = convert_int4(flips_xy1); \
fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \
fxy1 = ((flips_int_xy1 % 2) == 0) ? min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \
float4 in_x = (float4)(fxy0.xz, fxy1.xz); \
float4 x_f = floor(in_x); \
float4 x_lerp = in_x - x_f; \
int4 x_idx = convert_int4(x_f); \
float4 in_y = (float4)(fxy0.yw, fxy1.yw); \
float4 y_f = floor(in_y); \
float4 y_lerp = in_y - y_f; \
int4 y_idx = convert_int4(y_f); \
int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \
int8 input_desc; \
_viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \
int baseAddr = input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr); \
vxc_short8 top; \
vxc_short8 bottom; \
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.y; \
coord_in.y = y_idx.y; \
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.z; \
coord_in.y = y_idx.z; \
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.w; \
coord_in.y = y_idx.w; \
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
float4 left4; \
float4 right4; \
float4 top4; \
float4 bottom4; \
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord_out.w, baseAddr); \
int loop = depth - 1; \
vxc_ushort8 tmp, dst; \
while (coord_in.z < loop) \
{ \
VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \
_viv_asm(COPY, right4, src, 16); \
VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \
_viv_asm(COPY, left4, src, 16); \
right4 -= left4; \
top4 = right4 * x_lerp + left4; \
VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \
_viv_asm(COPY, right4, src, 16); \
VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \
_viv_asm(COPY, left4, src, 16); \
right4 -= left4; \
bottom4 = right4 * x_lerp + left4; \
bottom4 -= top4; \
float4 dst4 = bottom4 * y_lerp + top4; \
_viv_asm(COPY, tmp, dst4, 16); \
dst.s0123 = tmp.s1357; \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \
coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \
coord_in.x = x_idx.x; \
coord_in.y = y_idx.x; \
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.y; \
coord_in.y = y_idx.y; \
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.z; \
coord_in.y = y_idx.z; \
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.w; \
coord_in.y = y_idx.w; \
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
} \
VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \
_viv_asm(COPY, right4, src, 16); \
VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \
_viv_asm(COPY, left4, src, 16); \
right4 -= left4; \
top4 = right4 * x_lerp + left4; \
VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \
_viv_asm(COPY, right4, src, 16); \
VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \
_viv_asm(COPY, left4, src, 16); \
right4 -= left4; \
bottom4 = right4 * x_lerp + left4; \
bottom4 -= top4; \
float4 dst4 = bottom4 * y_lerp + top4; \
_viv_asm(COPY, tmp, dst4, 16); \
dst.s0123 = tmp.s1357; \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
__kernel void bilinear_grid_sample_reflect_BF16_BF16toBF16(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int align_corners)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
int4 coord_in1 = coord_out.xyxy;
coord_in1.xz = coord_in1.xz * 2;
vxc_short8 read_val;
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
float4 fxy0;
float4 fxy1;
vxc_short8 src;
VXC_DP2x8(src, read_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniBF16toFp32_part0_2x8);
_viv_asm(COPY, fxy0, src, 16);
VXC_DP2x8(src, read_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniBF16toFp32_part1_2x8);
_viv_asm(COPY, fxy1, src, 16);
GRID_SAMPLE_BF16_PROCESS();
}

View File

@ -0,0 +1,217 @@
#include "cl_viv_vx_ext.h"
_viv_uniform float2 half_input0_wh;
_viv_uniform float2 add_float_value;
_viv_uniform int depth;
_viv_uniform VXC_512Bits uniEvenBintoFp32_4x4;
_viv_uniform VXC_512Bits uniOddSubEvenBin_4x4;
_viv_uniform VXC_512Bits uniExtactHalf8_2x8;
_viv_uniform float4 span_wh;
_viv_uniform float4 min_val_wh;
#define GRID_SAMPLE_F16_PROCESS() \
fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \
fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \
fxy0 = fabs(fxy0 - min_val_wh); \
fxy1 = fabs(fxy1 - min_val_wh); \
float4 flips_xy0 = floor(fxy0 / span_wh); \
float4 flips_xy1 = floor(fxy1 / span_wh); \
float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \
float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \
int4 flips_int_xy0 = convert_int4(flips_xy0); \
int4 flips_int_xy1 = convert_int4(flips_xy1); \
fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \
fxy1 = ((flips_int_xy1 % 2) == 0) ? min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \
float4 in_x = (float4)(fxy0.xz, fxy1.xz); \
float4 x_f = floor(in_x); \
float4 x_lerp = in_x - x_f; \
int4 x_idx = convert_int4(x_f); \
float4 in_y = (float4)(fxy0.yw, fxy1.yw); \
float4 y_f = floor(in_y); \
float4 y_lerp = in_y - y_f; \
int4 y_idx = convert_int4(y_f); \
int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \
int8 input_desc; \
_viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \
int baseAddr = input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr); \
vxc_short8 t0; \
vxc_short8 b0; \
vxc_half8 top; \
vxc_half8 bottom; \
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.y; \
coord_in.y = y_idx.y; \
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.z; \
coord_in.y = y_idx.z; \
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.w; \
coord_in.y = y_idx.w; \
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, top, t0, 16); \
_viv_asm(COPY, bottom, b0, 16); \
float4 left4; \
float4 right4; \
float4 top4; \
float4 bottom4; \
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord_out.w, baseAddr); \
int loop = depth - 1; \
while (coord_in.z < loop) \
{ \
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \
top4 = right4 * x_lerp + left4; \
VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \
VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \
bottom4 = right4 * x_lerp + left4; \
bottom4 -= top4; \
float4 dst4 = bottom4 * y_lerp + top4; \
half4 tmp; \
_viv_asm(CONV, tmp, dst4); \
VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); \
vxc_short4 result; \
_viv_asm(COPY, result, top, 8); \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \
coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \
coord_in.x = x_idx.x; \
coord_in.y = y_idx.x; \
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.y; \
coord_in.y = y_idx.y; \
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.z; \
coord_in.y = y_idx.z; \
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.w; \
coord_in.y = y_idx.w; \
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, top, t0, 16); \
_viv_asm(COPY, bottom, b0, 16); \
} \
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \
top4 = right4 * x_lerp + left4; \
VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \
VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \
bottom4 = right4 * x_lerp + left4; \
bottom4 -= top4; \
float4 dst4 = bottom4 * y_lerp + top4; \
half4 tmp; \
_viv_asm(CONV, tmp, dst4); \
VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); \
vxc_short4 result; \
_viv_asm(COPY, result, top, 8); \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
__kernel void bilinear_grid_sample_reflect_F16_F32toF16(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int align_corners)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
int4 coord_in1 = coord_out.xyxy;
coord_in1.xz = coord_in1.xz * 2;
coord_in1.z = coord_in1.z + 4;
float4 fxy0 = read_imagef(input1, coord_in1.xy);
float4 fxy1 = read_imagef(input1, coord_in1.zw);
GRID_SAMPLE_F16_PROCESS();
}
_viv_uniform int input1_ZP;
_viv_uniform float input1Scale;
_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4;
_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;
__kernel void bilinear_grid_sample_reflect_F16_U8toF16(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int align_corners)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
int4 coord_in1 = coord_out.xyxy;
coord_in1.xz = coord_in1.xz * 2;
vxc_uchar16 read_coord;
VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
float4 fxy0;
float4 fxy1;
unsigned char input1ZP;
_viv_asm(COPY, input1ZP, input1_ZP, 4);
VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4);
VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);
fxy0 = fxy0 * input1Scale;
fxy1 = fxy1 * input1Scale;
GRID_SAMPLE_F16_PROCESS();
}
_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4;
_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4;
__kernel void bilinear_grid_sample_reflect_F16_F16toF16(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int align_corners)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
int4 coord_in1 = coord_out.xyxy;
coord_in1.xz = coord_in1.xz * 2;
vxc_short8 read_val;
vxc_half8 read_coord;
VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, read_coord, read_val, 16);
float4 fxy0;
float4 fxy1;
VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4);
VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);
GRID_SAMPLE_F16_PROCESS();
}

View File

@ -0,0 +1,224 @@
#include "cl_viv_vx_ext.h"
_viv_uniform float2 half_input0_wh;
_viv_uniform float2 add_float_value;
_viv_uniform int depth;
_viv_uniform VXC_512Bits uniEvenBintoFp32_4x4;
_viv_uniform VXC_512Bits uniOddSubEvenBin_4x4;
_viv_uniform VXC_512Bits uniExtact8Bit_2x8;
_viv_uniform float uint8Scale;
_viv_uniform float output_ZP;
_viv_uniform float4 span_wh;
_viv_uniform float4 min_val_wh;
#define GRID_SAMPLE_F16_to_U8_PROCESS() \
fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \
fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \
fxy0 = fabs(fxy0 - min_val_wh); \
fxy1 = fabs(fxy1 - min_val_wh); \
float4 flips_xy0 = floor(fxy0 / span_wh); \
float4 flips_xy1 = floor(fxy1 / span_wh); \
float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \
float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \
int4 flips_int_xy0 = convert_int4(flips_xy0); \
int4 flips_int_xy1 = convert_int4(flips_xy1); \
fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \
fxy1 = ((flips_int_xy1 % 2) == 0) ? min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \
float4 in_x = (float4)(fxy0.xz, fxy1.xz); \
float4 x_f = floor(in_x); \
float4 x_lerp = in_x - x_f; \
int4 x_idx = convert_int4(x_f); \
float4 in_y = (float4)(fxy0.yw, fxy1.yw); \
float4 y_f = floor(in_y); \
float4 y_lerp = in_y - y_f; \
int4 y_idx = convert_int4(y_f); \
int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \
int8 input_desc; \
_viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \
int baseAddr = input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr); \
vxc_short8 t0; \
vxc_short8 b0; \
vxc_uchar16 result; \
vxc_half8 top; \
vxc_half8 bottom; \
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.y; \
coord_in.y = y_idx.y; \
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.z; \
coord_in.y = y_idx.z; \
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.w; \
coord_in.y = y_idx.w; \
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, top, t0, 16); \
_viv_asm(COPY, bottom, b0, 16); \
float4 left4; \
float4 right4; \
float4 top4; \
float4 bottom4; \
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord_out.w, baseAddr); \
int loop = depth - 1; \
while (coord_in.z < loop) \
{ \
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \
top4 = right4 * x_lerp + left4; \
VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \
VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \
bottom4 = right4 * x_lerp + left4; \
bottom4 -= top4; \
float4 dst4 = bottom4 * y_lerp + top4; \
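/* scale and shift the interpolated result into the quantized U8 output range */ \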
dst4 = dst4 * uint8Scale + output_ZP; \
int4 dst = convert_int4_rte(dst4); \
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \
coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \
coord_in.x = x_idx.x; \
coord_in.y = y_idx.x; \
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.y; \
coord_in.y = y_idx.y; \
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.z; \
coord_in.y = y_idx.z; \
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.w; \
coord_in.y = y_idx.w; \
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, top, t0, 16); \
_viv_asm(COPY, bottom, b0, 16); \
} \
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \
top4 = right4 * x_lerp + left4; \
VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \
VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \
bottom4 = right4 * x_lerp + left4; \
bottom4 -= top4; \
float4 dst4 = bottom4 * y_lerp + top4; \
dst4 = dst4 * uint8Scale + output_ZP; \
int4 dst = convert_int4_rte(dst4); \
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
__kernel void bilinear_grid_sample_reflect_F16_F32toU8(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int align_corners)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
int4 coord_in1 = coord_out.xyxy;
coord_in1.xz = coord_in1.xz * 2;
coord_in1.z = coord_in1.z + 4;
float4 fxy0 = read_imagef(input1, coord_in1.xy);
float4 fxy1 = read_imagef(input1, coord_in1.zw);
GRID_SAMPLE_F16_to_U8_PROCESS();
}
_viv_uniform int input1_ZP;
_viv_uniform float input1Scale;
_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4;
_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;
__kernel void bilinear_grid_sample_reflect_F16_U8toU8(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output,
int align_corners)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
int4 coord_in1 = coord_out.xyxy;
coord_in1.xz = coord_in1.xz * 2;
vxc_uchar16 read_coord;
VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
float4 fxy0;
float4 fxy1;
unsigned char input1ZP;
_viv_asm(COPY, input1ZP, input1_ZP, 4);
VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4);
VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);
fxy0 = fxy0 * input1Scale;
fxy1 = fxy1 * input1Scale;
GRID_SAMPLE_F16_to_U8_PROCESS();
}
_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4;
_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4;
__kernel void bilinear_grid_sample_reflect_F16_F16toU8(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int align_corners)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
int4 coord_in1 = coord_out.xyxy;
coord_in1.xz = coord_in1.xz * 2;
vxc_short8 read_val;
vxc_half8 read_coord;
VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, read_coord, read_val, 16);
float4 fxy0;
float4 fxy1;
VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4);
VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);
GRID_SAMPLE_F16_to_U8_PROCESS();
}

View File

@ -0,0 +1,160 @@
#include "cl_viv_vx_ext.h"
_viv_uniform float2 half_input0_wh;
_viv_uniform float2 add_float_value;
_viv_uniform int depth;
_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4;
_viv_uniform VXC_512Bits uniRightSubLeft_4x4;
_viv_uniform VXC_512Bits uniExtact8Bit_2x8;
_viv_uniform VXC_512Bits uniDFPtoFp32_part0_4x4;
_viv_uniform VXC_512Bits uniDFPtoFp32_part1_4x4;
_viv_uniform float input1_scale;
_viv_uniform float dfpScale;
_viv_uniform float4 span_wh;
_viv_uniform float4 min_val_wh;
#define GRID_SAMPLE_I16_PROCESS() \
fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \
fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \
fxy0 = fabs(fxy0 - min_val_wh); \
fxy1 = fabs(fxy1 - min_val_wh); \
float4 flips_xy0 = floor(fxy0 / span_wh); \
float4 flips_xy1 = floor(fxy1 / span_wh); \
float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \
float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \
int4 flips_int_xy0 = convert_int4(flips_xy0); \
int4 flips_int_xy1 = convert_int4(flips_xy1); \
fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \
fxy1 = ((flips_int_xy1 % 2) == 0) ? min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \
float4 in_x = (float4)(fxy0.xz, fxy1.xz); \
float4 x_f = floor(in_x); \
float4 x_lerp = in_x - x_f; \
int4 x_idx = convert_int4(x_f); \
float4 in_y = (float4)(fxy0.yw, fxy1.yw); \
float4 y_f = floor(in_y); \
float4 y_lerp = in_y - y_f; \
int4 y_idx = convert_int4(y_f); \
int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \
int8 input_desc; \
_viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \
int baseAddr = input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr); \
vxc_short8 top; \
vxc_short8 bottom; \
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.y; \
coord_in.y = y_idx.y; \
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.z; \
coord_in.y = y_idx.z; \
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.w; \
coord_in.y = y_idx.w; \
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
float4 left4; \
float4 right4; \
float4 top4; \
float4 bottom4; \
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord_out.w, baseAddr); \
int loop = depth - 1; \
while (coord_in.z < loop) \
{ \
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \
top4 = right4 * x_lerp + left4; \
VXC_DP4x4(left4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \
VXC_DP4x4(right4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \
bottom4 = right4 * x_lerp + left4; \
bottom4 -= top4; \
float4 dst4 = bottom4 * y_lerp + top4; \
dst4 = dst4 * dfpScale; \
int4 dst = convert_int4_rte(dst4); \
VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \
coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \
coord_in.x = x_idx.x; \
coord_in.y = y_idx.x; \
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.y; \
coord_in.y = y_idx.y; \
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.z; \
coord_in.y = y_idx.z; \
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.w; \
coord_in.y = y_idx.w; \
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
} \
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \
top4 = right4 * x_lerp + left4; \
VXC_DP4x4(left4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \
VXC_DP4x4(right4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \
bottom4 = right4 * x_lerp + left4; \
bottom4 -= top4; \
float4 dst4 = bottom4 * y_lerp + top4; \
dst4 = dst4 * dfpScale; \
int4 dst = convert_int4_rte(dst4); \
VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
__kernel void bilinear_grid_sample_reflect_I16_I16toI16(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int align_corners)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
int4 coord_in1 = coord_out.xyxy;
coord_in1.xz = coord_in1.xz * 2;
vxc_short8 read_coord;
VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
float4 fxy0;
float4 fxy1;
VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part0_4x4);
VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part1_4x4);
fxy0 = fxy0 * input1_scale;
fxy1 = fxy1 * input1_scale;
GRID_SAMPLE_I16_PROCESS();
}

View File

@ -0,0 +1,160 @@
#include "cl_viv_vx_ext.h"
_viv_uniform float2 half_input0_wh;
_viv_uniform float2 add_float_value;
_viv_uniform int depth;
_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4;
_viv_uniform VXC_512Bits uniRightSubLeft_4x4;
_viv_uniform VXC_512Bits uniExtact8Bit_2x8;
_viv_uniform VXC_512Bits uniDFPtoFp32_part0_4x4;
_viv_uniform VXC_512Bits uniDFPtoFp32_part1_4x4;
_viv_uniform float input1_scale;
_viv_uniform float dfpScale;
_viv_uniform float4 span_wh;
_viv_uniform float4 min_val_wh;
#define GRID_SAMPLE_I8_PROCESS() \
fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \
fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \
fxy0 = fabs(fxy0 - min_val_wh); \
fxy1 = fabs(fxy1 - min_val_wh); \
float4 flips_xy0 = floor(fxy0 / span_wh); \
float4 flips_xy1 = floor(fxy1 / span_wh); \
float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \
float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \
int4 flips_int_xy0 = convert_int4(flips_xy0); \
int4 flips_int_xy1 = convert_int4(flips_xy1); \
fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \
fxy1 = ((flips_int_xy1 % 2) == 0) ? min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \
float4 in_x = (float4)(fxy0.xz, fxy1.xz); \
float4 x_f = floor(in_x); \
float4 x_lerp = in_x - x_f; \
int4 x_idx = convert_int4(x_f); \
float4 in_y = (float4)(fxy0.yw, fxy1.yw); \
float4 y_f = floor(in_y); \
float4 y_lerp = in_y - y_f; \
int4 y_idx = convert_int4(y_f); \
int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \
int8 input_desc; \
_viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \
int baseAddr = input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr); \
vxc_char16 top; \
vxc_char16 bottom; \
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.y; \
coord_in.y = y_idx.y; \
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.z; \
coord_in.y = y_idx.z; \
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.w; \
coord_in.y = y_idx.w; \
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
float4 left4; \
float4 right4; \
float4 top4; \
float4 bottom4; \
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord_out.w, baseAddr); \
int loop = depth - 1; \
while (coord_in.z < loop) \
{ \
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \
top4 = right4 * x_lerp + left4; \
VXC_DP4x4(left4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \
VXC_DP4x4(right4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \
bottom4 = right4 * x_lerp + left4; \
bottom4 -= top4; \
float4 dst4 = bottom4 * y_lerp + top4; \
dst4 = dst4 * dfpScale; \
int4 dst = convert_int4_rte(dst4); \
VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \
coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \
coord_in.x = x_idx.x; \
coord_in.y = y_idx.x; \
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.y; \
coord_in.y = y_idx.y; \
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.z; \
coord_in.y = y_idx.z; \
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.w; \
coord_in.y = y_idx.w; \
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
} \
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \
top4 = right4 * x_lerp + left4; \
VXC_DP4x4(left4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \
VXC_DP4x4(right4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \
bottom4 = right4 * x_lerp + left4; \
bottom4 -= top4; \
float4 dst4 = bottom4 * y_lerp + top4; \
dst4 = dst4 * dfpScale; \
int4 dst = convert_int4_rte(dst4); \
VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
__kernel void bilinear_grid_sample_reflect_I8_I8toI8(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int align_corners)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
int4 coord_in1 = coord_out.xyxy;
coord_in1.xz = coord_in1.xz * 2;
vxc_char16 read_coord;
VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
float4 fxy0;
float4 fxy1;
VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part0_4x4);
VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part1_4x4);
fxy0 = fxy0 * input1_scale;
fxy1 = fxy1 * input1_scale;
GRID_SAMPLE_I8_PROCESS();
}
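
/* For readability: per output lane, the DP4x4/lerp sequence in the macro above
 * performs a standard bilinear blend (uniDFPtoFp32_left_4x4 yields the left
 * sample, uniRightSubLeft_4x4 yields right minus left). A minimal scalar sketch
 * of that arithmetic; the helper name is illustrative and not kernel source: */
static float bilinear_blend(float top_left, float top_right,
                            float bottom_left, float bottom_right,
                            float x_lerp, float y_lerp)
{
    /* top4    = (right - left) * x_lerp + left
     * bottom4 = (right - left) * x_lerp + left
     * dst4    = (bottom4 - top4) * y_lerp + top4 */
    float top    = top_left + (top_right - top_left) * x_lerp;
    float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
    return top + (bottom - top) * y_lerp;
}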

View File

@ -0,0 +1,224 @@
#include "cl_viv_vx_ext.h"
_viv_uniform float2 half_input0_wh;
_viv_uniform float2 add_float_value;
_viv_uniform int depth;
_viv_uniform VXC_512Bits uniU8SubZPtoFp32_left_4x4;
_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4;
_viv_uniform VXC_512Bits uniExtact8Bit_2x8;
_viv_uniform int input_ZP;
_viv_uniform float uint8Scale;
_viv_uniform float output_ZP;
_viv_uniform int input1_ZP;
_viv_uniform float input1Scale;
_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4;
_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;
_viv_uniform float4 span_wh;
_viv_uniform float4 min_val_wh;
#define GRID_SAMPLE_U8_PROCESS() \
fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \
fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \
fxy0 = fabs(fxy0 - min_val_wh); \
fxy1 = fabs(fxy1 - min_val_wh); \
float4 flips_xy0 = floor(fxy0 / span_wh); \
float4 flips_xy1 = floor(fxy1 / span_wh); \
float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \
float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \
int4 flips_int_xy0 = convert_int4(flips_xy0); \
int4 flips_int_xy1 = convert_int4(flips_xy1); \
fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \
fxy1 = ((flips_int_xy1 % 2) == 0) ? min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \
float4 in_x = (float4)(fxy0.xz, fxy1.xz); \
float4 x_f = floor(in_x); \
float4 x_lerp = in_x - x_f; \
int4 x_idx = convert_int4(x_f); \
float4 in_y = (float4)(fxy0.yw, fxy1.yw); \
float4 y_f = floor(in_y); \
float4 y_lerp = in_y - y_f; \
int4 y_idx = convert_int4(y_f); \
int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \
int8 input_desc; \
_viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \
int baseAddr = input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr); \
vxc_uchar16 top; \
vxc_uchar16 bottom; \
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.y; \
coord_in.y = y_idx.y; \
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.z; \
coord_in.y = y_idx.z; \
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.w; \
coord_in.y = y_idx.w; \
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
float4 left4; \
float4 right4; \
float4 top4; \
float4 bottom4; \
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord_out.w, baseAddr); \
int loop = depth - 1; \
while (coord_in.z < loop) \
{ \
unsigned char inputZP; \
_viv_asm(COPY, inputZP, input_ZP, 4); \
VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \
top4 = right4 * x_lerp + left4; \
VXC_DP4x4(left4, bottom, inputZP, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \
VXC_DP4x4(right4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \
bottom4 = right4 * x_lerp + left4; \
bottom4 -= top4; \
float4 dst4 = bottom4 * y_lerp + top4; \
dst4 = dst4 * uint8Scale + output_ZP; \
int4 dst = convert_int4_rte(dst4); \
VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \
coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \
coord_in.x = x_idx.x; \
coord_in.y = y_idx.x; \
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.y; \
coord_in.y = y_idx.y; \
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.z; \
coord_in.y = y_idx.z; \
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = x_idx.w; \
coord_in.y = y_idx.w; \
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
} \
unsigned char inputZP; \
_viv_asm(COPY, inputZP, input_ZP, 4); \
VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \
top4 = right4 * x_lerp + left4; \
VXC_DP4x4(left4, bottom, inputZP, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \
VXC_DP4x4(right4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \
bottom4 = right4 * x_lerp + left4; \
bottom4 -= top4; \
float4 dst4 = bottom4 * y_lerp + top4; \
dst4 = dst4 * uint8Scale + output_ZP; \
int4 dst = convert_int4_rte(dst4); \
VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
__kernel void bilinear_grid_sample_reflect_U8_F32toU8(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int align_corners)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
int4 coord_in1 = coord_out.xyxy;
coord_in1.xz = coord_in1.xz * 2;
coord_in1.z = coord_in1.z + 4;
float4 fxy0 = read_imagef(input1, coord_in1.xy);
float4 fxy1 = read_imagef(input1, coord_in1.zw);
GRID_SAMPLE_U8_PROCESS();
}
__kernel void bilinear_grid_sample_reflect_U8_U8toU8(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output,
int align_corners)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
int4 coord_in1 = coord_out.xyxy;
coord_in1.xz = coord_in1.xz * 2;
vxc_uchar16 read_coord;
VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
float4 fxy0;
float4 fxy1;
unsigned char input1ZP;
_viv_asm(COPY, input1ZP, input1_ZP, 4);
VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4);
VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);
fxy0 = fxy0 * input1Scale;
fxy1 = fxy1 * input1Scale;
GRID_SAMPLE_U8_PROCESS();
}
_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4;
_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4;
__kernel void bilinear_grid_sample_reflect_U8_F16toU8(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int align_corners)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
int4 coord_in1 = coord_out.xyxy;
coord_in1.xz = coord_in1.xz * 2;
vxc_short8 read_val;
vxc_half8 read_coord;
VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, read_coord, read_val, 16);
float4 fxy0;
float4 fxy1;
VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4);
VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);
GRID_SAMPLE_U8_PROCESS();
}
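
/* The reflect padding arithmetic at the top of GRID_SAMPLE_U8_PROCESS folds
 * out-of-range sampling coordinates back into [min_val, min_val + span] with
 * mirrored repetition. A minimal scalar sketch of that mapping, matching the
 * fabs/floor/parity steps in the macro; the helper name is illustrative only: */
#include <math.h>
static float reflect_coord(float x, float min_val, float span)
{
    float folded = fabsf(x - min_val);
    float flips  = floorf(folded / span);
    float extra  = folded - flips * span;
    return ((int)flips % 2 == 0) ? (min_val + extra)
                                 : (min_val + (span - extra));
}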

File diff suppressed because it is too large

View File

@ -39,6 +39,7 @@
#include "vsi_nn_internal_node.h"
#include "utils/vsi_nn_constraint_check.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_dtype_util_prv.h"
#include "vsi_nn_error.h"
#define _INPUT_NUM (1)
@ -199,14 +200,11 @@ static vsi_bool op_setup
vsi_nn_internal_node_t* curr = NULL;
float min = self->nn_param.clip.min;
float max = self->nn_param.clip.max;
uint32_t infinity = VSI_NN_FLOAT32_INF;
float neg_infinity = -*(float*)&infinity;
int32_t max_float = *(int32_t*)&max;
if ( (min == -1.0f && max == 1.0f)
|| (min == 0.0f && max == 6.0f)
|| (min == 0.0f && max_float == VSI_NN_FLOAT32_INF)
|| (min == neg_infinity && max_float == VSI_NN_FLOAT32_INF))
|| (min == 0.0f && fp32_is_inf(max))
|| (fp32_is_inf(-min) && fp32_is_inf(max)))
{
vsi_nn_internal_init_node_wksp(self);
if (min == -1.0f && max == 1.0f)
@ -217,7 +215,7 @@ static vsi_bool op_setup
{
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU6, 0, 0);
}
else if (min == 0.0f && max_float == VSI_NN_FLOAT32_INF)
else if (min == 0.0f && fp32_is_inf(max))
{
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU, 0, 0);
}
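
/* The clip setup now delegates infinity detection to fp32_is_inf() from
 * utils/vsi_nn_dtype_util_prv.h instead of hand-rolled bit-pattern compares.
 * The implementation is not shown in this diff; a plausible equivalent, given
 * only as a hedged sketch (name suffixed to avoid implying the real one), is
 * the usual IEEE-754 exponent check: */
#include <stdint.h>
static int fp32_is_inf_sketch(float v)
{
    union { float f; uint32_t u; } bits;
    bits.f = v;
    /* Infinite when all exponent bits are set and the mantissa is zero. */
    return (bits.u & 0x7FFFFFFFu) == 0x7F800000u;
}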

View File

@ -430,10 +430,13 @@ static vsi_bool op_setup
// create activation output/hstate_output/cstate_output
vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_CELL_OUT_OUTPUT]->attr.dtype, TRUE);
act_out = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
CHECK_PTR_FAIL_GOTO(act_out, "Create internal tensor failed", final);
vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_CELL_OUT_H_STATE]->attr.dtype, TRUE);
act_h_out = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
CHECK_PTR_FAIL_GOTO(act_h_out, "Create internal tensor failed", final);
vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_CELL_OUT_C_STATE]->attr.dtype, TRUE);
act_c_out = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
CHECK_PTR_FAIL_GOTO(act_c_out, "Create internal tensor failed", final);
curr->outputs[LSTMUNIT_ACT_OUTPUT] = act_out->t;
curr->outputs[LSTMUNIT_ACT_HSTATE_OUT] = act_h_out->t;
curr->outputs[LSTMUNIT_ACT_CSTATE_OUT] = act_c_out->t;

View File

@ -89,7 +89,7 @@ static vsi_status op_grouped_compute
if (NULL == LOCAL()->weight_tensor_group)
{
VSILOGE("Malloc fail, (GROUPED_DECONV2D) at [%s : %d]\n", __FILE__, __LINE__);
return VSI_FAILURE;
goto final;
}
memset(LOCAL()->weight_tensor_group, 0, group * sizeof(vsi_nn_tensor_t *));
res = vsi_nn_CreateTensorGroup(self->graph, inputs[1], 2,
@ -325,8 +325,8 @@ static vsi_status op_compute
#endif
// param.a_x = self->nn_param.deconv.dilation;
// param.a_y = self->nn_param.deconv.dilation;
param.ext.khr.a_x = 1;
param.ext.khr.a_y = 1;
param.ext.khr.a_x = (size_t)self->nn_param.deconv.output_padding[0];
param.ext.khr.a_y = (size_t)self->nn_param.deconv.output_padding[1];
param.ext.khr.padding_x = (size_t)self->nn_param.deconv.pad[0];
param.ext.khr.padding_y = (size_t)self->nn_param.deconv.pad[2];
param.ext.khr.overflow_policy = self->vx_param.overflow_policy;
@ -336,6 +336,7 @@ static vsi_status op_compute
param.ext.channel_group = self->nn_param.deconv.group;
param.stride_x = self->nn_param.deconv.stride[0];
param.stride_y = self->nn_param.deconv.stride[1];
param.down_scale_size_rounding = self->vx_param.down_scale_size_rounding;
//param.border_mode;
//param.border_const;

View File

@ -66,7 +66,7 @@ static vsi_status op_compute
input_tensor[1] = tmp_tensor;
self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
"signal_frame",
"extra_ending",
input_tensor, 2,
outputs, 1, NULL );

View File

@ -65,9 +65,15 @@ static vsi_status op_compute
}
else
{
#define _TENSOR_LEN 64
vsi_nn_tensor_attr_t attr;
vsi_nn_tensor_t* temp_tensors = NULL;
char gather_tensor_name[_TENSOR_LEN];
char copy_tensor_name[_TENSOR_LEN];
memset(gather_tensor_name, 0, sizeof(gather_tensor_name));
memset(copy_tensor_name, 0, sizeof(copy_tensor_name));
VSILOGW("gather is no_range_change operation! \
Insert DataConvert Operation when the quantization parameters of input and output are inconsistent!");
@ -78,7 +84,20 @@ static vsi_status op_compute
temp_tensors = vsi_nn_CreateTensor( self->graph, &attr );
vsi_nn_kernel_selector( self->graph, "gather", inputs, 2, &temp_tensors, 1, param );
snprintf(gather_tensor_name, sizeof(gather_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 0);
if (vxSetReferenceName((vx_reference)temp_tensors->t, gather_tensor_name) == VSI_FAILURE)
{
VSILOGW("Set uid %u gather node output name fail", self->uid);
vsi_safe_release_tensor(temp_tensors);
return VSI_FAILURE;
}
n = vxTensorCopyNode( self->graph->g, temp_tensors->t, outputs[0]->t);
snprintf(copy_tensor_name, sizeof(copy_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 1);
if (vxSetReferenceName((vx_reference)outputs[0]->t, copy_tensor_name) == VSI_FAILURE)
{
VSILOGW("Set uid %u copy node output name fail", self->uid);
vsi_safe_release_tensor(temp_tensors);
return VSI_FAILURE;
}
vsi_safe_release_tensor(temp_tensors);
}

View File

@ -38,6 +38,7 @@
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
#include "utils/vsi_nn_constraint_check.h"
#include "vsi_nn_error.h"
typedef struct _gather_elements_local_data_t {
int32_t placeholder;
@ -101,6 +102,7 @@ static vsi_status op_compute
attr.is_const = FALSE;
attr.vtl = TRUE;
temp_tensors = vsi_nn_CreateTensor( self->graph, &attr );
CHECK_PTR_FAIL_GOTO( temp_tensors, "Create tensor fail.", final );
}
else
{
@ -148,6 +150,7 @@ static vsi_status op_compute
vsi_safe_release_tensor(temp_tensors);
}
final:
vsi_nn_kernel_param_release( &param );
if ( self->n )

View File

@ -53,11 +53,13 @@ static vsi_status op_compute
vsi_nn_kernel_param_t* param = NULL;
int32_t align_corners = self->nn_param.gridsample.align_corners;
int32_t pad_mode = (int32_t)self->nn_param.gridsample.padding_mode;
vsi_nn_kernel_node_t n;
char kernel_name[128];
param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_int32(param, "align_corners", align_corners);
vsi_nn_kernel_param_add_int32(param, "padding_mode", pad_mode);
switch (self->nn_param.gridsample.mode) {
case VSI_NN_INTERPOLATION_BILINEAR:
@ -103,13 +105,20 @@ static vsi_bool op_check
return FALSE;
}
if (!((VSI_NN_PAD_MODE_CONSTANT ==
if ((VSI_NN_PAD_MODE_CONSTANT ==
self->nn_param.gridsample.padding_mode) &&
(0 == self->nn_param.gridsample.const_val))) {
(0 != self->nn_param.gridsample.const_val)) {
VSILOGE("Only support padding const 0 now!");
return FALSE;
}
if (VSI_NN_PAD_MODE_SYMMETRIC ==
self->nn_param.gridsample.padding_mode) {
VSILOGE("Can't support VSI_NN_PAD_MODE_SYMMETRIC now!");
return FALSE;
}
return TRUE;
} /* op_check() */
@ -124,6 +133,11 @@ static vsi_bool op_setup
return FALSE;
}
if (2 != inputs[1]->attr.size[0])
{
return FALSE;
}
if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) {
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
outputs[0]->attr.size[0] = inputs[1]->attr.size[1];
@ -133,6 +147,16 @@ static vsi_bool op_setup
outputs[0]->attr.size[3] = inputs[0]->attr.size[3];
}
}
else
{
if ((outputs[0]->attr.dim_num != inputs[0]->attr.dim_num)
|| (outputs[0]->attr.size[0] != inputs[1]->attr.size[1])
|| (outputs[0]->attr.size[1] != inputs[1]->attr.size[2]))
{
return FALSE;
}
}
return TRUE;
} /* op_setup() */

View File

@ -121,6 +121,7 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
#define _TENSOR_LEN 64
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_param_t * param = NULL;
vsi_nn_kernel_node_t n = NULL;
@ -129,15 +130,36 @@ static vsi_status op_compute
vsi_size_t new_rank = 0;
vsi_nn_tensor_t * tmp_tensors[4] = {NULL};
char reshape0_tensor_name[_TENSOR_LEN];
char reshape1_tensor_name[_TENSOR_LEN];
char instance_norm_tensor_name[_TENSOR_LEN];
memset(reshape0_tensor_name, 0, sizeof(reshape0_tensor_name));
memset(reshape1_tensor_name, 0, sizeof(reshape1_tensor_name));
memset(instance_norm_tensor_name, 0, sizeof(instance_norm_tensor_name));
vsi_nn_optimize_instance_norm_shape(inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank);
tmp_tensors[0] = vsi_nn_kernel_insert_reshape_node( self->graph,
inputs[0], shape, (uint32_t)new_rank, VSI_NN_OPTIMIZE_BACKWARD );
snprintf(reshape0_tensor_name, sizeof(reshape0_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 0);
if (vxSetReferenceName((vx_reference)tmp_tensors[0]->t, reshape0_tensor_name) == VSI_FAILURE)
{
VSILOGW("Set uid %u reshape 0 node output name fail", self->uid);
return VSI_FAILURE;
}
tmp_tensors[1] = inputs[1];
tmp_tensors[2] = inputs[2];
tmp_tensors[3] = vsi_nn_kernel_insert_reshape_node( self->graph,
outputs[0], shape, (uint32_t)new_rank, VSI_NN_OPTIMIZE_FORWARD );
snprintf(reshape1_tensor_name, sizeof(reshape1_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 1);
if (vxSetReferenceName((vx_reference)outputs[0]->t, reshape1_tensor_name) == VSI_FAILURE)
{
VSILOGW("Set uid %u reshape 1 node output name fail", self->uid);
return VSI_FAILURE;
}
status = _try_set_high_presision_tensor(tmp_tensors);
if (status != VSI_SUCCESS)
{
@ -155,6 +177,12 @@ static vsi_status op_compute
self->n = (vx_node)n;
status = VSI_SUCCESS;
}
snprintf(instance_norm_tensor_name, sizeof(instance_norm_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 2);
if (vxSetReferenceName((vx_reference)tmp_tensors[3]->t, instance_norm_tensor_name) == VSI_FAILURE)
{
VSILOGW("Set uid %u instance_norm node output name fail", self->uid);
return VSI_FAILURE;
}
if (param != NULL)
{

View File

@ -259,9 +259,9 @@ static vsi_status op_deinit
vsi_nn_node_t * self
)
{
vsi_nn_safe_free(self->nn_param.layernorm.local);
#if (!VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
vsi_nn_safe_free(self->nn_param.layernorm.local);
vsi_nn_internal_deinit_node_wksp( self );
#endif

View File

@ -161,14 +161,6 @@ static vsi_bool _check_is_sp_supported_type
{
int32_t * axes = self->nn_param.reduce.local2->axes;
int32_t axes_num = self->nn_param.reduce.local2->axes_num;
vsi_size_t shapes[4][VSI_NN_MAX_DIM_NUM] = { {0} };
int32_t axis_in[VSI_NN_MAX_DIM_NUM] = {0};
int32_t new_axis[VSI_NN_MAX_DIM_NUM] = {0};
int32_t i = 0;
uint32_t axis_size = 0;
uint32_t rank_in = 0;
uint32_t rank_out = 0;
vsi_bool ret = FALSE;
if ( !self->graph->ctx->config.support_stream_processor ||
(type != VSI_NN_REDUCE_SUM && type != VSI_NN_REDUCE_MEAN && type != VSI_NN_REDUCE_MAX) )
@ -191,22 +183,15 @@ static vsi_bool _check_is_sp_supported_type
return FALSE;
}
for (i = 0; i < axes_num; i++)
if ( (axes_num == 1 && (axes[0] == 0 || axes[0] == 2)) ||
(axes_num == 2 && ((axes[0] < 2 && axes[1] < 2) || (axes[0] == 1 && axes[1] == 2))) )
{
shapes[0][i] = input->attr.size[axes[i]];
shapes[1][i] = 1;
axis_in[i] = i;
return TRUE;
}
ret = vsi_nn_kernel_optimize_reduce_shape(
shapes[0], axes_num,
axis_in, axes_num,
shapes[1], axes_num,
shapes[2], &rank_in, shapes[3], &rank_out,
new_axis, &axis_size);
return ret && axis_size < 3;
return FALSE;
}
static vsi_status op_compute
(
vsi_nn_node_t * self,
@ -839,82 +824,28 @@ static vsi_bool op_set_sp_reduce_internal
vsi_enum type_name
)
{
vsi_nn_tensor_attr_t attr;
vsi_nn_internal_tensor_t* tensor1 = NULL;
vsi_nn_tensor_t* new_output = NULL;
uint32_t* permute_in_perm = NULL;
int32_t * new_axis = NULL;
vsi_size_t shapes[VSI_NN_MAX_DIM_NUM] = {1};
int32_t use_virtual_tensor = TRUE;
vsi_nn_internal_node_t* tmp_inode = NULL;
int32_t * axes = self->nn_param.reduce.local2->axes;
int32_t axes_num = self->nn_param.reduce.local2->axes_num;
int32_t i = 0, j = 0, index = 0;
vsi_size_t reduce_size = 1;
vsi_bool ret = FALSE;
int32_t i = 0;
vsi_nn_internal_init_node_wksp( self );
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
memcpy(&attr.dtype, &inputs[0]->attr.dtype, sizeof(vsi_nn_dtype_t));
attr.dim_num = VSI_NN_DIM_AUTO;
attr.vtl = use_virtual_tensor;
attr.is_const = FALSE;
tensor1 = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
CHECK_PTR_FAIL_GOTO(tensor1, "Create internal tensor failed", final);
tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_PERMUTE, 0, 0 );
tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_REDUCE_MEAN_INTERNAL, 0, 0 );
CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final);
permute_in_perm = (uint32_t *)vsi_nn_internal_new_node_param(tmp_inode,
inputs[0]->attr.dim_num * sizeof(uint32_t));
CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(permute_in_perm, tmp_inode, "Create buffer failed", final);
tmp_inode->inputs[0] = inputs[0];
tmp_inode->outputs[0] = outputs[0];
tmp_inode->node->nn_param.reduce_mean_internal.axis = axes;
tmp_inode->node->nn_param.reduce_mean_internal.axis_num = axes_num;
tmp_inode->node->nn_param.reduce_mean_internal.type = type_name;
for ( i = 0; i < axes_num; i++)
for (i = 0; i < axes_num; i++)
{
shapes[index] = outputs[0]->attr.size[axes[i]];
permute_in_perm[index ++] = axes[i];
reduce_size *= inputs[0]->attr.size[axes[i]];
}
for ( j = 0; j < (int32_t)inputs[0]->attr.dim_num; j++)
{
for (i = 0; i < axes_num; i++)
{
if (j == axes[i])
{
break;
}
}
if (i == axes_num)
{
shapes[index] = outputs[0]->attr.size[j];
permute_in_perm[index ++] = j;
}
}
tmp_inode->node->nn_param.permute.perm = permute_in_perm;
tmp_inode->node->nn_param.permute.dim_num = inputs[0]->attr.dim_num;
tmp_inode->inputs[0] = inputs[0];
tmp_inode->outputs[0] = tensor1->t;
vsi_nn_internal_setup_node(self, tmp_inode);
new_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shapes, outputs[0]->attr.dim_num);
CHECK_PTR_FAIL_GOTO(new_output, "Create tensor failed", final);
self->nn_param.reduce.local2->reshaped_output = new_output;
tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_REDUCE_MEAN_INTERNAL, 0, 0 );
CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final);
new_axis = (int32_t *)vsi_nn_internal_new_node_param(tmp_inode,
axes_num * sizeof(int32_t));
CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(new_axis, tmp_inode, "Create buffer failed", final);
for (i = 0; i < axes_num; i++)
{
new_axis[i] = i;
}
tmp_inode->inputs[0] = tensor1->t;
tmp_inode->outputs[0] = new_output;
tmp_inode->node->nn_param.reduce_mean_internal.axis = new_axis;
tmp_inode->node->nn_param.reduce_mean_internal.axis_num = axes_num;
tmp_inode->node->nn_param.reduce_mean_internal.type = type_name;
if (type_name == VSI_NN_REDUCE_SUM)
{
tmp_inode->node->nn_param.reduce_mean_internal.scale = 1.0f;
@ -1147,6 +1078,7 @@ static vsi_bool op_set_reduce_internal
re_sizes[axes[2]] = 1;
new_output = vsi_nn_reshape_tensor(self->graph, outputs[0], re_sizes, dim_num);
}
self->nn_param.reduce.local2->reshaped_output = new_output;
curr = vsi_nn_internal_new_node( self, type_name, 0, 0 );
CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
@ -1161,7 +1093,6 @@ static vsi_bool op_set_reduce_internal
curr->inputs[0] = tmp_output_tensor[1]->t;
}
curr->outputs[0] = new_output;
self->nn_param.reduce.local2->reshaped_output = new_output;
vsi_nn_internal_setup_node(self, curr);
}
else

View File

@ -136,7 +136,7 @@ static vsi_bool op_setup
attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
reducesum_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
CHECK_PTR_FAIL_GOTO(square_tensor, "Create internal tensor failed", final);
CHECK_PTR_FAIL_GOTO(reducesum_tensor, "Create internal tensor failed", final);
reducesum_node = vsi_nn_internal_new_node( self, VSI_NN_OP_REDUCE, 0, 0);
CHECK_PTR_FAIL_GOTO(reducesum_node, "Create internal node failed", final);

View File

@ -37,6 +37,7 @@
#include "vsi_nn_error.h"
#include "vsi_nn_internal_node.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util_prv.h"
static vsi_status op_compute
@ -88,7 +89,6 @@ static vsi_bool op_setup
float alpha = 0;
float max_value = 0;
float threshold = 0;
uint32_t max_raw = 0;
vsi_bool ret = FALSE;
if ( NULL == self )
@ -101,11 +101,9 @@ static vsi_bool op_setup
max_value = p->max_value;
threshold = p->threshold;
max_raw = *(uint32_t*)&max_value;
vsi_nn_internal_init_node_wksp(self);
if (alpha == 0 && max_raw == VSI_NN_FLOAT32_INF && threshold == 0)
if (alpha == 0.0f && fp32_is_inf(max_value) && threshold == 0.0f)
{
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU, 0, 0);
CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
@ -119,14 +117,14 @@ static vsi_bool op_setup
curr->inputs[0] = inputs[0];
curr->outputs[0] = outputs[0];
}
else if (alpha == 0 && max_value == 6.0f && threshold == 0)
else if (alpha == 0.0f && max_value == 6.0f && threshold == 0.0f)
{
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU6, 0, 0);
CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = inputs[0];
curr->outputs[0] = outputs[0];
}
else if (alpha == 0.1 && max_value == VSI_NN_FLOAT32_INF && threshold == 0)
else if (alpha == 0.1f && fp32_is_inf(max_value) && threshold == 0.0f)
{
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_LEAKY_RELU, 0, 0);
CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);

View File

@ -70,7 +70,11 @@ static vsi_status op_compute
self->graph,
(uint8_t *)self->nn_param.reshape.size,
&attr);
if (NULL == dims_tensor)
{
VSILOGE( "Create tensor fail." );
return VSI_FAILURE;
}
reshape_param.dims = REQUIRED_IO(dims_tensor);
self->n = vxTensorReshapeNode(self->graph->g,

View File

@ -75,6 +75,11 @@ static vsi_status op_compute
self->graph,
(uint8_t *)dims_data,
&attr);
if (NULL == dims_tensor)
{
VSILOGE( "Create tensor fail." );
return VSI_FAILURE;
}
reshape_param.dims = REQUIRED_IO(dims_tensor);

View File

@ -52,6 +52,7 @@ static vsi_status op_compute
vsi_status status = VSI_FAILURE;
int32_t align_corners = self->nn_param.resize_internal.align_corners;
int32_t half_pixel_centers = self->nn_param.resize_internal.half_pixel_centers;
int32_t type = self->nn_param.resize_internal.type;
vsi_enum layout = self->nn_param.resize_internal.layout;
vsi_nn_kernel_param_t * param = NULL;
@ -59,6 +60,7 @@ static vsi_status op_compute
vsi_nn_kernel_param_add_int32( param, "align_corners", align_corners );
vsi_nn_kernel_param_add_int32( param, "half_pixel_centers", half_pixel_centers );
vsi_nn_kernel_param_add_int32( param, "type", type );
if (layout == VSI_NN_RESIZE_LAYOUT_NCHW)
{
@ -186,6 +188,7 @@ static vsi_status op_init
vsi_status status = VSI_SUCCESS;
self->nn_param.resize_internal.layout = VSI_NN_RESIZE_LAYOUT_NCHW;
self->nn_param.resize_internal.type = VSI_NN_INTERPOLATION_BILINEAR;
return status;
} /* op_init() */

View File

@ -0,0 +1,202 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <string.h>
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_node.h"
#include "utils/vsi_nn_math.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
#include "vsi_nn_tensor_util_prv.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_constraint_check.h"
#include "vsi_nn_error.h"
/*
Declare number of input and output.
*/
#define _INPUT_NUM (2)
#define _OUTPUT_NUM (1)
static vsi_status op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_param_t* param = NULL;
vsi_nn_kernel_node_t n = NULL;
float eps = self->nn_param.rmsnorm.eps;
int32_t axis = self->nn_param.rmsnorm.axis;
param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_float32(param, "eps", eps);
vsi_nn_kernel_param_add_int32(param, "axis", axis);
n = vsi_nn_kernel_selector(self->graph, "rms_norm",
inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param);
if (n != NULL)
{
self->n = (vx_node)n;
status = VSI_SUCCESS;
}
if (param != NULL)
{
vsi_nn_kernel_param_release(&param);
}
return status;
} /* op_compute() */
static vsi_bool op_check
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_bool ret = vsi_nn_is_stream_process_supported_types(self->graph, inputs, self->input.num);
if (!ret)
{
BEGIN_IO_TYPE_DECL(RMS_NORM, 2, 1)
IO_TYPE(D_F32, D_F32, D_F32)
IO_TYPE(D_F16, D_F32, D_F16)
IO_TYPE(D_F16, D_F32, D_U8 | Q_ASYM)
IO_TYPE(D_F16, D_F32, D_I8 | Q_DFP)
IO_TYPE(D_F16, D_F32, D_I8 | Q_ASYM)
IO_TYPE(D_F16, D_F32, D_I8 | Q_SYM)
IO_TYPE(D_F16, D_F32, D_I16 | Q_DFP)
IO_TYPE(D_F16, D_F32, D_I16 | Q_ASYM)
IO_TYPE(D_F16, D_F32, D_I16 | Q_SYM)
IO_TYPE(D_BF16, D_F32, D_BF16)
IO_TYPE(D_U8 | Q_ASYM, D_F32, D_F16)
IO_TYPE(D_U8 | Q_ASYM, D_F32, D_U8 | Q_ASYM)
IO_TYPE(D_I16 | Q_DFP, D_F32, D_I16 | Q_DFP)
IO_TYPE(D_I16 | Q_ASYM, D_F32, D_I16 | Q_ASYM)
IO_TYPE(D_I16 | Q_SYM, D_F32, D_I16 | Q_SYM)
IO_TYPE(D_I16 | Q_DFP, D_F32, D_F16)
IO_TYPE(D_I16 | Q_ASYM, D_F32, D_F16)
IO_TYPE(D_I16 | Q_SYM, D_F32, D_F16)
IO_TYPE(D_I8 | Q_DFP, D_F32, D_I8 | Q_DFP)
IO_TYPE(D_I8 | Q_ASYM, D_F32, D_I8 | Q_ASYM)
IO_TYPE(D_I8 | Q_SYM, D_F32, D_I8 | Q_SYM)
IO_TYPE(D_I8 | Q_DFP, D_F32, D_F16)
IO_TYPE(D_I8 | Q_ASYM, D_F32, D_F16)
IO_TYPE(D_I8 | Q_SYM, D_F32, D_F16)
END_IO_TYPE_DECL(RMS_NORM)
if (!VALIDATE_OP_IO_TYPES(RMS_NORM, self, inputs, self->input.num, outputs, self->output.num))
{
char* desc = generate_op_io_types_desc(inputs,
self->input.num, outputs, self->output.num);
VSILOGE("Inputs/Outputs data type not support: %s", desc);
destroy_op_io_types_desc(desc);
return FALSE;
}
}
return TRUE;
} /* op_check() */
static vsi_bool op_setup
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_bool ret = TRUE;
if (NULL == self)
{
return FALSE;
}
ret = vsi_nn_op_common_setup(self, inputs, outputs);
return ret;
} /* op_setup() */
static vsi_status op_init
(
vsi_nn_node_t* self
)
{
self->nn_param.rmsnorm.axis = 0;
self->nn_param.rmsnorm.eps = 1e-8f;
return VSI_SUCCESS;
} /* op_init() */
__BEGIN_DECLS
/* Registrar */
DEF_OP_REG
(
/* op_name */ RMSNORM,
/* init */ op_init,
/* compute */ op_compute,
/* deinit */ vsi_nn_op_common_deinit,
/* check */ op_check,
/* setup */ op_setup,
/* optimize */ NULL,
/* input_num */ _INPUT_NUM,
/* output_num */ _OUTPUT_NUM
);
__END_DECLS
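
/* Reference model for the RMSNORM registration above: the op normalizes by the
 * root-mean-square of the input along the chosen axis and applies the second
 * input as a per-element scale. A minimal sketch over one normalized slice,
 * assuming input[1] carries the scale (gamma); the helper name and flat layout
 * are illustrative only: */
#include <math.h>
static void rms_norm_ref(const float* x, const float* gamma, float* y,
                         int n, float eps)
{
    /* y[i] = x[i] / sqrt(mean(x^2) + eps) * gamma[i] */
    float sum_sq = 0.0f;
    float inv_rms;
    int i;
    for (i = 0; i < n; i++)
    {
        sum_sq += x[i] * x[i];
    }
    inv_rms = 1.0f / sqrtf(sum_sq / (float)n + eps);
    for (i = 0; i < n; i++)
    {
        y[i] = x[i] * inv_rms * gamma[i];
    }
}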

View File

@ -0,0 +1,196 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <string.h>
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_node.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_test.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
typedef struct _shape_local_data_t {
vsi_nn_tensor_t *shape_tensor;
} shape_local_data_t;
/*
Declare number of input and output.
*/
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
static vsi_status op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
VSI_UNREFERENCED(inputs);
VSI_UNREFERENCED(outputs);
return vsi_nn_internal_compute_node( self );
} /* op_compute() */
static vsi_bool op_check
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
VSI_UNREFERENCED(self);
VSI_UNREFERENCED(inputs);
if (outputs[0]->attr.dtype.vx_type != VSI_NN_TYPE_INT32)
{
VSILOGD("Outputs data type not support");
return FALSE;
}
return TRUE;
} /* op_check() */
static vsi_status op_optimize
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs,
vsi_nn_opt_direction_e direction
)
{
VSI_UNREFERENCED(inputs);
VSI_UNREFERENCED(outputs);
return vsi_nn_internal_optimize_node( self, direction );
} /* op_optimize() */
static vsi_bool op_setup
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_bool ret = TRUE;
vsi_nn_internal_node_t* curr = NULL;
vsi_nn_tensor_attr_t attr;
vx_int32 shapes[VSI_NN_MAX_DIM_NUM] = {0};
uint32_t i = 0;
for ( i = 0; i < inputs[0]->attr.dim_num; i++ )
{
shapes[i] = (int32_t)inputs[0]->attr.size[i];
}
memset(&attr, 0, sizeof(attr));
attr.size[0] = inputs[0]->attr.dim_num;
attr.dim_num = 1;
attr.is_const = TRUE;
attr.dtype.vx_type = VSI_NN_TYPE_INT32;
attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
self->nn_param.shape.local->shape_tensor = vsi_nn_CreateTensorFromData(
self->graph,
(uint8_t *)shapes,
&attr);
if ( NULL == self->nn_param.shape.local->shape_tensor )
{
VSILOGE("Create shape_tensor fail.(shape)");
ret = FALSE;
goto final;
}
vsi_nn_internal_init_node_wksp(self);
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 1, 1);
if (NULL == curr)
{
ret = FALSE;
goto final;
}
curr->inputs[0] = self->nn_param.shape.local->shape_tensor;
curr->outputs[0] = outputs[0];
ret = vsi_nn_internal_setup_node(self, curr);
final:
return ret;
} /* op_setup() */
static vsi_status op_init
(
vsi_nn_node_t* self
)
{
self->nn_param.shape.local =
(shape_local_data_t *)malloc(sizeof(shape_local_data_t));
if (NULL == self->nn_param.shape.local)
{
return VSI_FAILURE;
}
memset( self->nn_param.shape.local, 0, sizeof(shape_local_data_t) );
return VSI_SUCCESS;
} /* op_init() */
static vsi_status op_deinit
(
vsi_nn_node_t * self
)
{
if (self->nn_param.shape.local)
{
vsi_safe_release_tensor(self->nn_param.shape.local->shape_tensor);
}
vsi_nn_safe_free(self->nn_param.shape.local);
vsi_nn_internal_deinit_node_wksp(self);
vsi_nn_op_common_deinit(self);
return VSI_SUCCESS;
} /* op_deinit() */
__BEGIN_DECLS
/* Registrar */
DEF_OP_REG
(
/* op_name */ SHAPE,
/* init */ op_init,
/* compute */ op_compute,
/* deinit */ op_deinit,
/* check */ op_check,
/* setup */ op_setup,
/* optimize */ op_optimize,
/* input_num */ _INPUT_NUM,
/* output_num */ _OUTPUT_NUM
);
__END_DECLS
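
/* Functionally, the SHAPE op above materializes the input tensor's dimension
 * sizes as a constant int32 vector and copies it to the output through an
 * internal DATACONVERT node. A behavioural sketch of that computation; the
 * helper name is illustrative only: */
static void shape_ref(const vsi_size_t* in_size, uint32_t dim_num, int32_t* out)
{
    uint32_t i;
    for (i = 0; i < dim_num; i++)
    {
        out[i] = (int32_t)in_size[i];
    }
}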

View File

@ -735,6 +735,15 @@ static vsi_bool op_setup
outputs[0]->attr.dim_num++;
}
/* If output dim_num is 0, the tensor should be a scalar. */
if (outputs[0]->attr.dim_num == 0)
{
outputs[0]->attr.dim_num = 1;
outputs[0]->attr.size[0] = 1;
vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
}
}
_get_stride_slice_start_stop_stride(self, inputs, outputs);

View File

@ -124,9 +124,6 @@ static vsi_status op_compute
outputs[0]->attr.size, outputs[0]->attr.dim_num, axis,
shapes[1], &rank_out, &new_axis1);
param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_int32( param, "top_k", top_k );
if (ret)
{
uint32_t perm_in[VSI_NN_MAX_DIM_NUM] = {0};
@ -195,10 +192,14 @@ static vsi_status op_compute
outputs_tensor[1] = reshape_tensors[2];
}
param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_int32( param, "top_k", top_k );
self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "topk",
&input_tensor, _INPUT_NUM,
outputs_tensor, _OUTPUT_NUM, param );
vsi_nn_kernel_param_release( &param );
if (axis != 0)
{
_create_permute_node(self, outputs_tensor[0], reshape_tensors[1], perm_out, rank_in, TRUE);

View File

@ -25,6 +25,7 @@
#include <string.h>
#include <stdlib.h>
#include <float.h>
#include "vsi_nn_types.h"
#include "vsi_nn_log.h"
@ -47,8 +48,6 @@ typedef struct _upsamplescale_local_data_t {
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
#define _EPSILON 1e-8
static vsi_status op_compute
(
vsi_nn_node_t * self,
@ -69,7 +68,7 @@ static vsi_status op_compute
stride = self->nn_param.upsamplescale.stride;
scale = self->nn_param.upsamplescale.scale;
if (stride == 1 || vsi_nn_abs(scale - 1.0f) == _EPSILON)
if (stride == 1 || vsi_nn_abs(scale - 1.0f) < FLT_EPSILON)
{
return vsi_nn_internal_compute_node( self );
}
@ -148,7 +147,7 @@ static vsi_status op_optimize
VSI_UNREFERENCED(inputs);
VSI_UNREFERENCED(outputs);
if (stride == 1 && vsi_nn_abs(scale - 1.0f) == _EPSILON)
if (stride == 1 && vsi_nn_abs(scale - 1.0f) < FLT_EPSILON)
{
return vsi_nn_internal_optimize_node( self, direction );
}
@ -174,7 +173,7 @@ static vsi_bool op_setup
vsi_nn_internal_init_node_wksp(self);
if (stride == 1 && vsi_nn_abs(scale - 1.0f) == _EPSILON)
if (stride == 1 && vsi_nn_abs(scale - 1.0f) < FLT_EPSILON)
{
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 0, 0);
CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
@ -194,7 +193,7 @@ static vsi_bool op_setup
ret = vsi_nn_internal_setup_node(self, curr);
}
else if (vsi_nn_abs(scale - 1.0f) == _EPSILON)
else if (vsi_nn_abs(scale - 1.0f) < FLT_EPSILON)
{
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RESIZE, 0, 0);
CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);

View File

@ -469,6 +469,8 @@ static _op_param_gen_t s_op_gen[] =
/* REDUCEL2 */ NULL,
/* CROP_AND_RESIZE */ NULL,
/* TAN */ NULL,
/* RMSNORM */ NULL,
/* SHAPE */ NULL,
};
_compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c );

View File

@ -46,6 +46,11 @@
#include "utils/vsi_nn_math.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_dtype_util_prv.h"
#if (defined(__ANDROID__)) && (__ANDROID_API__ > 21)
#include <sys/system_properties.h>
#endif
typedef struct _vx_status_desc_t
{
@ -387,10 +392,11 @@ float vsi_nn_DataAsFloat32
)
{
float val;
uint32_t *p = (uint32_t*)(&val);
fp32_bit_cast_t fp32_bit_cast;
int16_t fp16;
*p = 0xFFFFFFFF;
fp32_bit_cast.data = 0xFFFFFFFF;
val = fp32_bit_cast.val;
switch( type )
{
case VSI_NN_TYPE_BOOL8:
@ -1462,11 +1468,15 @@ void vsi_nn_get_tensor_clamp_min_max
}
else
{
uint32_t f32_min = 0xff800000;
uint32_t f32_max = 0x7f800000;
fp32_bit_cast_t fp32_bit_cast;
float pos_infinity;
float neg_infinity;
fp32_bit_cast.data = VSI_NN_FLOAT32_INF;
pos_infinity = fp32_bit_cast.val;
neg_infinity = -pos_infinity;
*clampMin = *(float*)&f32_min;
*clampMax = *(float*)&f32_max;
*clampMin = neg_infinity;
*clampMax = pos_infinity;
}
}
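
/* Both hunks above rely on fp32_bit_cast_t to move between a float value and
 * its raw bit pattern without violating strict aliasing. Its definition is not
 * part of this diff; it is assumed to be a plain float/uint32 punning union
 * along these lines (sketch only, with a suffixed name): */
#include <stdint.h>
typedef union
{
    uint32_t data; /* raw IEEE-754 bits */
    float    val;  /* interpreted float value */
} fp32_bit_cast_sketch_t;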

View File

@ -93,7 +93,7 @@ final:
return status;
}
#if (defined(__ANDROID__)) && (ANDROID_SDK_VERSION >= 30)
#if (defined(__ANDROID__)) && ((ANDROID_SDK_VERSION >= 30) || (__ANDROID_API__ >= 30))
static const char* ENV_ENABLE_SHADER = "vendor.VIV_VX_ENABLE_SHADER";
static const char* ENV_ENABLE_OPCHECK = "vendor.VSI_NN_ENABLE_OPCHECK";
static const char* ENV_ENABLE_CONCAT_OPTIMIZE = "vendor.VSI_NN_ENABLE_CONCAT_OPTIMIZE";

View File

@ -799,6 +799,7 @@ static vsi_status batchInference_graph
original_inputs_attr = (vsi_nn_tensor_attr_t*)malloc(sizeof(vsi_nn_tensor_attr_t) * graph->max_node_io);
original_outputs_attr = (vsi_nn_tensor_attr_t*)malloc(sizeof(vsi_nn_tensor_attr_t) * graph->max_node_io);
approximateConstTensor = (vsi_nn_tensor_id_t*)malloc(sizeof(vsi_nn_tensor_id_t) * graph->tensor_num);
CHECK_PTR_FAIL_GOTO(approximateConstTensor, "Malloc fail.", final);
memset(approximateConstTensor, -1, sizeof(vsi_nn_tensor_id_t) * graph->tensor_num);
if (NULL == inputs || NULL == outputs || NULL == original_inputs_attr || NULL == original_outputs_attr)
@ -878,6 +879,7 @@ static vsi_status batchInference_graph
vsi_size_t iterator_list_index = 0;
vsi_size_t list_index = 0;
vsi_size_t* iterator_list = (vsi_size_t*)malloc(sizeof(vsi_size_t) * (batchNum + 1));
CHECK_PTR_FAIL_GOTO(iterator_list, "Malloc fail.", final);
memset(iterator_list, 0, sizeof(vsi_size_t) * (batchNum + 1));
if (((vsi_nn_node_prv_t*)node)->split_num > 0)
@ -885,6 +887,7 @@ static vsi_status batchInference_graph
iterator_list[iterator_list_index++] = ((vsi_nn_node_prv_t*)node)->split_num;
if (((vsi_nn_node_prv_t*)node)->split_num == 1)
{/*if user set split_num = 1, there is no need to batch split.*/
vsi_nn_safe_free(iterator_list);
continue;
}
}
@ -1015,6 +1018,7 @@ static vsi_status batchInference_graph
}
}
vsi_nn_safe_free(iterator_list);
/*restore node input batch number*/
num_of_node_inputs = node->input.num;
for (k = 0; k < num_of_node_inputs; k++)
@ -1053,7 +1057,7 @@ static vsi_status batchInference_graph
}
}
final:
final:
for (i = 0; i < graph->node_num; i++)
{
node_id = nodes_list[i];
@ -1067,7 +1071,7 @@ static vsi_status batchInference_graph
node->input.num, inputs);
vsi_nn_GetTensors(graph, node->output.tensors,
node->output.num, outputs);
for (j = 0; j < node->output.num; j++)
for (j = 0; outputs && j < node->output.num; j++)
{
if (outputs[j] == NULL)
{

View File

@ -716,6 +716,12 @@ vsi_status vsi_nn_internal_optimize_node
for ( i = n - 1; i >= 0; i-- )
{
curr = (vsi_nn_internal_node_t *)vsi_nn_LinkListGetIndexNode((vsi_nn_link_list_t *)WKSP(node), i);
if ( NULL == curr )
{
VSILOGE("get point fail");
status = VSI_FAILURE;
break;
}
VSILOGD("Optimize backward for node uid[%u] sub_uid[%u] op[%s]",
node->uid, curr->node->uid, vsi_nn_OpGetName(curr->node->op));

View File

@ -29,7 +29,7 @@
#include "vsi_nn_log.h"
#include "vsi_nn_types.h"
#if (defined(__ANDROID__)) && (ANDROID_SDK_VERSION >= 30)
#if (defined(__ANDROID__)) && ((ANDROID_SDK_VERSION >= 30) || (__ANDROID_API__ >= 30))
static const char* ENV_LOG_LEVEL = "vendor.VSI_NN_LOG_LEVEL";
#else
static const char* ENV_LOG_LEVEL = "VSI_NN_LOG_LEVEL";

View File

@ -1013,6 +1013,7 @@ vsi_status vsi_nn_AddBinaryGraphInputsWithCropParamForCropOnly
sizeof(numParams));
if (VSI_SUCCESS != status)
{
vsi_nn_safe_free(nodes);
goto final;
}
for (p = 0; p < numParams; p++)

View File

@ -3258,6 +3258,7 @@ static vsi_bool _init_dummy_tensor
#endif
// This is a hack that driver doesn't support const scales
scales = (float*)malloc(sizeof(float) * tensor->attr.dtype.scale_dim);
CHECK_PTR_FAIL_GOTO( scales, "Create buffer fail.", final );
memcpy(scales,
tensor->attr.dtype.scales,
tensor->attr.dtype.scale_dim * sizeof(float));