Update internal ovxlib to rel1.2.6 (#690)
Internal ovxlib commit hash: c5d3e69356579fc7b595a7c0939fc7e4e0aaab5a
Type: Code Improvement
Signed-off-by: Feiyue Chen <Feiyue.Chen@verisilicon.com>
parent 3ea908ca6d
commit 3b80968fb1
@@ -197,3 +197,5 @@ DEF_OP(RESIZE_3D)
DEF_OP(REDUCEL2)
DEF_OP(CROP_AND_RESIZE)
DEF_OP(TAN)
DEF_OP(RMSNORM)
DEF_OP(SHAPE)
@@ -26,6 +26,8 @@
#define _VSI_NN_KERNEL_H

#include <stdint.h>
#include <stdarg.h>

#include "vsi_nn_log.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_graph.h"
@@ -81,6 +83,7 @@ typedef enum
    U4,
    FP8_E4M3,
    FP8_E5M2,
    INVALID_DTYPE,
} VSI_PUBLIC_TYPE vsi_nn_kernel_dtype_e;

typedef enum
@@ -532,9 +535,8 @@ static VSI_INLINE_API vsi_nn_kernel_dtype_e vsi_nn_kernel_map_dtype
            return FP8_E5M2;
        default:
            VSILOGE("error data type %d", dtype);
            break;
            return INVALID_DTYPE;
    }
    return I8;
} /* vsi_nn_kernel_map_dtype() */

static VSI_INLINE_API vsi_nn_type_e vsi_nn_dtype_map_kernel
@@ -43,6 +43,7 @@ typedef struct _vsi_nn_resize_internal_param
    vsi_bool half_pixel_centers;
    float factor;
    vsi_enum layout;
    vsi_enum type;
} vsi_nn_resize_internal_param;

#ifdef __cplusplus
@@ -0,0 +1,54 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#ifndef _VSI_NN_OP_RMSNORM_H
#define _VSI_NN_OP_RMSNORM_H

#include "vsi_nn_types.h"

#ifdef __cplusplus
extern "C" {
#endif


typedef struct _rmsnorm_local_data_t {
    int32_t placeholder;
} rmsnorm_local_data_t;

typedef struct _vsi_nn_rmsnorm_param
{
    struct _rmsnorm_local_data_t* local;
    float eps;
    int32_t axis;
} vsi_nn_rmsnorm_param;

_compiler_assert(offsetof(vsi_nn_rmsnorm_param, local) == 0, \
    vsi_nn_rmsnorm_h );

#ifdef __cplusplus
}
#endif

#endif
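For orientation, the new vsi_nn_rmsnorm_param only carries eps and axis: RMS normalization scales each element by the reciprocal root-mean-square taken along the normalized axis, with eps guarding the square root. A minimal CPU sketch of the math (illustration only, not part of ovxlib; treating the op's second input as the gamma/weight vector is an assumption):

/* --- illustrative example, not part of the diff --- */
#include <math.h>
#include <stddef.h>

/* y[i] = x[i] / sqrt(mean(x^2) + eps) * gamma[i], along one axis of length n */
static void rmsnorm_ref(const float* x, const float* gamma, float* y,
                        size_t n, float eps)
{
    float sum_sq = 0.0f;
    for (size_t i = 0; i < n; i++)
    {
        sum_sq += x[i] * x[i];
    }
    float inv_rms = 1.0f / sqrtf(sum_sq / (float)n + eps);
    for (size_t i = 0; i < n; i++)
    {
        y[i] = x[i] * inv_rms * gamma[i];
    }
}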
@@ -0,0 +1,47 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#ifndef _VSI_NN_OP_SHAPE_H
#define _VSI_NN_OP_SHAPE_H

#include "vsi_nn_types.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef struct _vsi_nn_shape_param
{
    struct _shape_local_data_t* local;
    // Add parameters here
} vsi_nn_shape_param;
_compiler_assert(offsetof(vsi_nn_shape_param, local) == 0, \
    vsi_nn_shape_h );

#ifdef __cplusplus
}
#endif

#endif
@@ -33,6 +33,14 @@
extern "C" {
#endif

/*
 * A helper union for fp32 bit casting.
 */
typedef union {
    float val;
    uint32_t data;
} fp32_bit_cast_t;

static VSI_INLINE_API vsi_bool type_is_integer
    (
    const vsi_nn_type_e type
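The helper union replaces pointer type punning such as *(uint32_t*)&val, which violates C strict-aliasing rules; writing one union member and reading the other is the portable idiom used throughout the hunks below. A self-contained sketch of the round trip (illustration only, not part of the diff):

/* --- illustrative example, not part of the diff --- */
#include <stdint.h>
#include <stdio.h>

typedef union {
    float    val;
    uint32_t data;
} fp32_bit_cast_t;

int main(void)
{
    fp32_bit_cast_t cast;

    /* float -> raw IEEE-754 bits */
    cast.val = 1.0f;
    printf("bits of 1.0f : 0x%08X\n", cast.data);   /* 0x3F800000 */

    /* raw bits -> float */
    cast.data = 0x40490FDBu;                        /* ~3.1415927f */
    printf("value        : %f\n", cast.val);

    return 0;
}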
@@ -203,9 +211,11 @@ static VSI_INLINE_API vsi_bool fp32_is_inf
    float val
    )
{
    uint32_t u_value = *(uint32_t*)&val;
    fp32_bit_cast_t fp32_bit_cast;
    fp32_bit_cast.val = val;
    uint32_t fp32_data = fp32_bit_cast.data;

    if ((u_value & (uint32_t)VSI_NN_INT32_MAX) == (uint32_t)VSI_NN_FLOAT32_INF)
    if ((fp32_data & (uint32_t)VSI_NN_INT32_MAX) == (uint32_t)VSI_NN_FLOAT32_INF)
    {
        return TRUE;
    }
@@ -232,7 +242,9 @@ static VSI_INLINE_API int32_t fp32_to_affine

    if (fp32_is_inf(in) != 0)
    {
        uint32_t sign = (*(uint32_t*)&in) >> 31;
        fp32_bit_cast_t fp32_bit_cast;
        fp32_bit_cast.val = in;
        uint32_t sign = fp32_bit_cast.data >> 31;
        data = sign == 1 ? (int32_t)min_range : (int32_t)max_range;
    }
@@ -277,7 +289,9 @@ static VSI_INLINE_API int32_t fp32_to_dfp

    if (fp32_is_inf(in) != 0)
    {
        uint32_t sign = (*(uint32_t*)&in) >> 31;
        fp32_bit_cast_t fp32_bit_cast;
        fp32_bit_cast.val = in;
        uint32_t sign = fp32_bit_cast.data >> 31;
        data = sign == 1 ? (int32_t)min_range : (int32_t) max_range;
    }
@@ -373,8 +387,9 @@ static VSI_INLINE_API float bfp16_to_fp32
    int16_t in
    )
{
    int32_t t1, t2, t3;
    uint32_t t1, t2, t3;
    float out;
    fp32_bit_cast_t fp32_bit_cast;

    t1 = in & 0x00FF; // Mantissa
    t2 = in & 0xFF00; // Sign bit + Exponent
@@ -384,9 +399,10 @@ static VSI_INLINE_API float bfp16_to_fp32
    t2 <<= 16; // Shift (sign + Exponent) bit into position
    t1 |= t2;  // Re-insert (sign + Exponent) bit

    *((uint32_t*)&out) = t1;
    fp32_bit_cast.data = t1;
    out = fp32_bit_cast.val;

    return t3 == 0 ? 0 : out;
    return t3 == 0 ? 0.0f : out;
} /* bfp16_to_fp32() */

static VSI_INLINE_API uint16_t fp32_to_fp16
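A bfloat16 value is simply the upper 16 bits of the corresponding fp32 pattern, so widening amounts to re-attaching 16 zero mantissa bits. A simplified sketch of that widening, without the library's t3 zero check (illustration only):

/* --- illustrative example, not part of the diff --- */
#include <stdint.h>
#include <stdio.h>

/* bf16 -> fp32 by placing the 16 stored bits in the high half of the word */
static float bf16_bits_to_fp32(uint16_t bf16)
{
    union { float val; uint32_t data; } cast;
    cast.data = (uint32_t)bf16 << 16;
    return cast.val;
}

int main(void)
{
    printf("%f\n", bf16_bits_to_fp32(0x3F80)); /* 1.000000  */
    printf("%f\n", bf16_bits_to_fp32(0xC040)); /* -3.000000 */
    return 0;
}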
@@ -394,10 +410,12 @@ static VSI_INLINE_API uint16_t fp32_to_fp16
    float in
    )
{
    uint32_t fp32 = *((uint32_t *) &in);
    uint32_t t1 = (fp32 & 0x80000000u) >> 16; /* sign bit. */
    uint32_t t2 = (fp32 & 0x7F800000u) >> 13; /* Exponent bits */
    uint32_t t3 = (fp32 & 0x007FE000u) >> 13; /* Mantissa bits, no rounding */
    fp32_bit_cast_t fp32_bit_cast;
    fp32_bit_cast.val = in;
    uint32_t fp32_data = fp32_bit_cast.data;
    uint32_t t1 = (fp32_data & 0x80000000u) >> 16; /* sign bit. */
    uint32_t t2 = (fp32_data & 0x7F800000u) >> 13; /* Exponent bits */
    uint32_t t3 = (fp32_data & 0x007FE000u) >> 13; /* Mantissa bits, no rounding */
    uint32_t fp16 = 0u;
    if( t2 >= 0x023c00u )
    {
@@ -420,8 +438,10 @@ static VSI_INLINE_API uint16_t fp32_to_bfp16
    float in
    )
{
    uint32_t fp32 = *((unsigned int *) &in);
    uint32_t t1 = fp32 >> 16;
    fp32_bit_cast_t fp32_bit_cast;
    fp32_bit_cast.val = in;
    uint32_t fp32_data = fp32_bit_cast.data;
    uint32_t t1 = fp32_data >> 16;

    return (uint16_t) t1;
} /* fp32_to_bfp16() */
@@ -435,10 +455,12 @@ static VSI_INLINE_API uint16_t fp32_to_bfp16_rtne
    Convert a float point to bfloat16, with round-nearest-to-even as rounding method.
    */

    uint32_t fp32 = *((unsigned int *) &in);
    fp32_bit_cast_t fp32_bit_cast;
    fp32_bit_cast.val = in;
    uint32_t fp32_data = fp32_bit_cast.data;
    uint16_t out;

    uint32_t lsb = (fp32 >> 16) & 1; /* Least significant bit of resulting bfloat. */
    uint32_t lsb = (fp32_data >> 16) & 1; /* Least significant bit of resulting bfloat. */
    uint32_t rounding_bias = 0x7fff + lsb;

    if ( VSI_NN_FLOAT32_NAN == in )
@@ -447,8 +469,8 @@ static VSI_INLINE_API uint16_t fp32_to_bfp16_rtne
    }
    else
    {
        fp32 += rounding_bias;
        out = (uint16_t) (fp32 >> 16);
        fp32_data += rounding_bias;
        out = (uint16_t) (fp32_data >> 16);
    }

    return out;
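The rounding bias (0x7fff plus the would-be least significant bit of the bf16 result) implements round-to-nearest-even: exact halfway cases land on the even mantissa pattern. A small worked check (illustration only):

/* --- illustrative example, not part of the diff --- */
#include <stdint.h>
#include <stdio.h>

/* Round an fp32 bit pattern to bf16 with round-to-nearest-even. */
static uint16_t fp32_bits_to_bf16_rtne(uint32_t fp32_data)
{
    uint32_t lsb = (fp32_data >> 16) & 1;   /* LSB of the bf16 result */
    uint32_t rounding_bias = 0x7fff + lsb;  /* ties round toward even */
    return (uint16_t)((fp32_data + rounding_bias) >> 16);
}

int main(void)
{
    /* 0x3F808000 is exactly halfway between bf16 0x3F80 and 0x3F81;
     * the bias keeps the even pattern 0x3F80. */
    printf("0x%04X\n", fp32_bits_to_bf16_rtne(0x3F808000u)); /* 0x3F80 */
    /* 0x3F818000 is halfway between 0x3F81 and 0x3F82; rounds to even 0x3F82. */
    printf("0x%04X\n", fp32_bits_to_bf16_rtne(0x3F818000u)); /* 0x3F82 */
    return 0;
}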
@@ -466,7 +488,9 @@ static VSI_INLINE_API uint16_t fp32_to_bfp16_rtne

static VSI_INLINE_API uint8_t fp32_to_fp8_e4m3(float in, const float scale) {
    float fp8_f32 = in / scale;
    int32_t in_val = *((int32_t*)&fp8_f32);
    fp32_bit_cast_t fp32_bit_cast;
    fp32_bit_cast.val = fp8_f32;
    uint32_t in_val = fp32_bit_cast.data;

    uint32_t in_sign = (in_val >> (FLOAT_EXPONENT_SIZE + FLOAT_MANTISSA_SIZE)) & 0x1; /* bit 31 is sign */
    uint32_t in_exp = (in_val >> FLOAT_MANTISSA_SIZE) & 0xFF; /* bit[30: 24] is exp */
@@ -512,7 +536,9 @@ static VSI_INLINE_API uint8_t fp32_to_fp8_e4m3(float in, const float scale) {

static VSI_INLINE_API uint8_t fp32_to_fp8_e5m2(float in, const float scale) {
    float fp8_f32 = in / scale;
    int32_t in_val = *((int32_t*)&fp8_f32);
    fp32_bit_cast_t fp32_bit_cast;
    fp32_bit_cast.val = fp8_f32;
    uint32_t in_val = fp32_bit_cast.data;
    uint32_t in_sign = (in_val >> (FLOAT_EXPONENT_SIZE + FLOAT_MANTISSA_SIZE)) & 0x1; /* bit 31 is sign */
    uint32_t in_exp = (in_val >> FLOAT_MANTISSA_SIZE) & 0xFF; /* bit[30: 24] is exp */
    uint32_t in_man = (in_val & 0x7FFFFF); /* low 23 bits is man */
@@ -561,6 +587,7 @@ static VSI_INLINE_API float fp8_e4m3_to_fp32(uint8_t in, const float scale) {
    uint32_t exponentOut = 0;
    uint32_t mantissaOut = 0;
    uint32_t out_u = 0;
    fp32_bit_cast_t fp32_bit_cast;

    {
        uint32_t signIn;
@@ -610,7 +637,8 @@ static VSI_INLINE_API float fp8_e4m3_to_fp32(uint8_t in, const float scale) {
    }
final:
    out_u = signOut << 31 | exponentOut << 23 | mantissaOut;
    val_fp32 = *((float*)&out_u);
    fp32_bit_cast.data = out_u;
    val_fp32 = fp32_bit_cast.val;

    return val_fp32 * scale;
} /* fp8_e4m3_to_fp32() */
@@ -621,6 +649,7 @@ static VSI_INLINE_API float fp8_e5m2_to_fp32(int8_t in, const float scale) {
    uint32_t exponentOut = 0;
    uint32_t mantissaOut = 0;
    uint32_t out_u = 0;
    fp32_bit_cast_t fp32_bit_cast;

    {
        uint32_t signIn;
@@ -670,7 +699,8 @@ static VSI_INLINE_API float fp8_e5m2_to_fp32(int8_t in, const float scale) {
    }
final:
    out_u = signOut << 31 | exponentOut << 23 | mantissaOut;
    val_fp32 = *((float*)&out_u);
    fp32_bit_cast.data = out_u;
    val_fp32 = fp32_bit_cast.val;
    return val_fp32 * scale;
} /* fp8_e5m2_to_fp32() */
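Both FP8 formats are decoded by expanding into an fp32 bit pattern and then applying the per-tensor scale: E4M3 uses 4 exponent bits (bias 7) and 3 mantissa bits, E5M2 uses 5 and 2 (bias 15). A small decode check for normal (non-denormal, non-NaN) E4M3 values (illustration only, not the library routine):

/* --- illustrative example, not part of the diff --- */
#include <stdint.h>
#include <stdio.h>

/* Decode a normal FP8 E4M3 value: 1 sign bit, 4 exponent bits (bias 7), 3 mantissa bits. */
static float fp8_e4m3_decode(uint8_t in)
{
    uint32_t sign = (in >> 7) & 0x1;
    uint32_t exp  = (in >> 3) & 0xF;
    uint32_t man  = in & 0x7;
    /* (1 + man/8) * 2^(exp - 7) */
    float value = (1.0f + (float)man / 8.0f) * (float)(1 << exp) / 128.0f;
    return sign ? -value : value;
}

int main(void)
{
    printf("%f\n", fp8_e4m3_decode(0x3C)); /* 1.500000  */
    printf("%f\n", fp8_e4m3_decode(0xC0)); /* -2.000000 */
    return 0;
}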
@@ -60,9 +60,7 @@ typedef struct _vsi_nn_hw_config_t
{
    char target_name[VSI_NN_MAX_TARGET_NAME];
    vsi_nn_hw_evis_t evis;
#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
    uint32_t subGroupSize;
#endif
    uint32_t use_40bits_va;
    uint32_t support_stream_processor;
    uint32_t sp_exec_count;
@@ -210,6 +210,8 @@
#include "ops/vsi_nn_op_resize_3d.h"
#include "ops/vsi_nn_op_reducel2.h"
#include "ops/vsi_nn_op_crop_and_resize.h"
#include "ops/vsi_nn_op_rmsnorm.h"
#include "ops/vsi_nn_op_shape.h"
/* custom node head define define */
#include "custom/vsi_nn_custom_node_type.h"
#include "ops/vsi_nn_op_inverse_sigmoid.h"
@@ -408,6 +410,8 @@ typedef union _vsi_nn_nn_param
    vsi_nn_resize_3d_param resize_3d;
    vsi_nn_reducel2_param reducel2;
    vsi_nn_crop_and_resize_param crop_and_resize;
    vsi_nn_rmsnorm_param rmsnorm;
    vsi_nn_shape_param shape;
    void* client_param;

    /* custom node data struct define */
@@ -33,7 +33,7 @@ extern "C"{

#define VSI_NN_VERSION_MAJOR 1
#define VSI_NN_VERSION_MINOR 2
#define VSI_NN_VERSION_PATCH 2
#define VSI_NN_VERSION_PATCH 6
#define VSI_NN_VERSION \
    (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
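With the patch level bumped from 2 to 6, the packed constant evaluates to 1 * 10000 + 2 * 100 + 6 = 10206, so downstream code can gate on the release (illustration only, assuming the translation unit already includes the version header):

/* --- illustrative example, not part of the diff --- */
/* VSI_NN_VERSION packs major/minor/patch into one integer:
 * 1 * 10000 + 2 * 100 + 6 = 10206 for rel1.2.6 */
#if VSI_NN_VERSION >= 10206
/* safe to rely on the RMSNORM / SHAPE ops added in this release */
#endif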
@@ -267,7 +267,7 @@ static vsi_nn_kernel_node_t _setup
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_TINY_YOLOV4_POSTPROCESS_CONFIDENCE_PARAM_NUM];
    vsi_nn_kernel_node_t node = NULL;
    vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = { 0 };
    vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = { { 0 } };
    vsi_nn_tensor_t* reshape_tensors[2] = { NULL };

    VSI_UNREFERENCED(params);
@@ -55,8 +55,17 @@ static vsi_status op_compute
    vsi_nn_kernel_param_t * param = NULL;
    vsi_nn_custom_warp_affine_param * p;
    p = &(self->nn_param.custom_warp_affine);

    float matrix_shape[6] = { 1.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f};
    param = vsi_nn_kernel_param_create();
    //Unlike OpenCV, we use the coordinate of dst and matrix to calculate the coordinate of src in custom_warp_affine.
    //Therefore, matrix M_ovx in custom_warp_affine is different from matrix M_cv in OpenCV.
    //We get M_ovx by transposing the inverse of M_cv.
    //inv_M = cv2.invertAffineTransform(M_cv); M_ovx=inv_M.transpose(1,0)
    if (p->matrix == NULL)
    {
        p->matrix = matrix_shape;
    }

    vsi_nn_kernel_param_add_const_buffer( param, "matrix", p->matrix, 6 );
    vsi_nn_kernel_param_add_int32( param, "type", p->type);
    vsi_nn_kernel_param_add_int32( param, "rgb_type", p->rgb_type);
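The new comments describe the dst-to-src convention: the six floats passed as "matrix" are the transposed inverse of OpenCV's forward 2x3 affine matrix. A host-side helper sketching that conversion (illustration only; the exact storage order of the six floats is an assumption based on the comment above):

/* --- illustrative example, not part of the diff --- */
/* Convert an OpenCV-style forward affine matrix
 *     [a b tx]
 *     [c d ty]
 * into the inverse-transposed 6-float form described in the comments above. */
static int warp_affine_cv_to_ovx(const float m_cv[6], float m_ovx[6])
{
    float a = m_cv[0], b = m_cv[1], tx = m_cv[2];
    float c = m_cv[3], d = m_cv[4], ty = m_cv[5];
    float det = a * d - b * c;
    if (det == 0.0f)
    {
        return -1; /* not invertible */
    }
    /* inverse of the forward mapping */
    float ia =  d / det, ib = -b / det;
    float ic = -c / det, id =  a / det;
    float itx = -(ia * tx + ib * ty);
    float ity = -(ic * tx + id * ty);
    /* transpose(1, 0): store column by column (assumed layout) */
    m_ovx[0] = ia;  m_ovx[1] = ic;
    m_ovx[2] = ib;  m_ovx[3] = id;
    m_ovx[4] = itx; m_ovx[5] = ity;
    return 0;
}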
@@ -47,18 +47,26 @@ typedef enum
} _internal_kernel_e;

#define _BILINEAR_GRID_SAMPLE_KERNEL_SOURCE() "bilinear_grid_sample"
#define _BILINEAR_GRID_SAMPLE_REFLECT_KERNEL_SOURCE() "bilinear_grid_sample_reflect"

#define STR(a) #a

// Add kernel hashtable here
#define BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
    ((IN1_DTYPE << 20) | (IN0_DTYPE << 8) | (OUT_DTYPE))
#define BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, REFLECT) \
    ((IN1_DTYPE << 24) | (IN0_DTYPE << 16) | (OUT_DTYPE << 8) | (REFLECT))

#define PACK_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
    { \
        BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE), \
        CVIVANTE_NAMESPACE("cl.bilinear_grid_sample_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \
        _BILINEAR_GRID_SAMPLE_KERNEL_SOURCE() \
        BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0), \
        CVIVANTE_NAMESPACE("cl.bilinear_grid_sample_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \
        _BILINEAR_GRID_SAMPLE_KERNEL_SOURCE() \
    }

#define PACK_REFLECT_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
    { \
        BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1), \
        CVIVANTE_NAMESPACE("cl.bilinear_grid_sample_reflect_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \
        _BILINEAR_GRID_SAMPLE_REFLECT_KERNEL_SOURCE() \
    }

typedef struct
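The widened hash key packs the grid dtype, input dtype, output dtype and the new reflect flag into disjoint bit fields, so the reflect and non-reflect kernel variants always map to distinct table entries. A tiny check of that property (the dtype value used is illustrative only):

/* --- illustrative example, not part of the diff --- */
#include <assert.h>

enum { U8 = 3 };   /* illustrative enum value, not the real vsi_nn_kernel_dtype_e */

#define BILINEAR_GRID_SAMPLE_HASH_KEY(IN0, IN1, OUT, REFLECT) \
    ((IN1 << 24) | (IN0 << 16) | (OUT << 8) | (REFLECT))

int main(void)
{
    /* the reflect bit keeps the two kernel variants at distinct keys */
    assert(BILINEAR_GRID_SAMPLE_HASH_KEY(U8, U8, U8, 0) !=
           BILINEAR_GRID_SAMPLE_HASH_KEY(U8, U8, U8, 1));
    return 0;
}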
@ -73,6 +81,8 @@ static const _kernel_map_type _bilinear_grid_sample_kernel_map[] =
|
|||
// Register kernel here
|
||||
PACK_KERNEL_MAP(F32, F32, F32 ),
|
||||
PACK_KERNEL_MAP(U8, U8, U8),
|
||||
PACK_REFLECT_KERNEL_MAP(F32, F32, F32),
|
||||
PACK_REFLECT_KERNEL_MAP(U8, U8, U8),
|
||||
};
|
||||
|
||||
|
||||
|
|
@ -95,23 +105,24 @@ static vx_param_description_t _bilinear_grid_sample_kernel_param_def[] =
|
|||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
};
|
||||
|
||||
#define _BILINEAR_GRID_SAMPLE_PARAM_NUM 8
|
||||
#define _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM \
|
||||
#define _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM 14
|
||||
#define _BILINEAR_GRID_SAMPLE_REFLECT_PARAM_NUM 12
|
||||
#define _BILINEAR_GRID_SAMPLE_REFLECT_PARAM_QUANT_NUM \
|
||||
_cnt_of_array(_bilinear_grid_sample_kernel_param_def)
|
||||
|
||||
|
||||
#define SCALAR_HALF_INPUT0_W (3)
|
||||
#define SCALAR_HALF_INPUT0_H (4)
|
||||
#define SCALAR_ADD_VALUE_W (5)
|
||||
#define SCALAR_ADD_VALUE_H (6)
|
||||
#define SCALAR_DEPTH (7)
|
||||
#define SCALAR_INPUT0_SCALE (8)
|
||||
#define SCALAR_INPUT0_TAIL (9)
|
||||
#define SCALAR_INPUT1_SCALE (10)
|
||||
#define SCALAR_INPUT1_TAIL (11)
|
||||
#define SCALAR_OUTPUT_SCALE (12)
|
||||
#define SCALAR_OUTPUT_TAIL (13)
|
||||
|
||||
/*
|
||||
* Kernel initializer
|
||||
|
|
@ -170,7 +181,8 @@ static vsi_status _query_kernel
|
|||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs,
|
||||
vsi_bool* is_use_u8_kernel
|
||||
vsi_bool* is_use_u8_kernel,
|
||||
int32_t is_reflect_mode
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
|
|
@ -199,14 +211,29 @@ static vsi_status _query_kernel
|
|||
out_dtype = F32;
|
||||
}
|
||||
if ((U8 == in0_dtype) || (U8 == out_dtype)) {
|
||||
param_def_size = _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM;
|
||||
if (is_reflect_mode)
|
||||
{
|
||||
param_def_size = _BILINEAR_GRID_SAMPLE_REFLECT_PARAM_QUANT_NUM;
|
||||
}
|
||||
else{
|
||||
param_def_size = _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM;
|
||||
}
|
||||
|
||||
*is_use_u8_kernel = TRUE;
|
||||
} else {
|
||||
param_def_size = _BILINEAR_GRID_SAMPLE_PARAM_NUM;
|
||||
if (is_reflect_mode)
|
||||
{
|
||||
param_def_size = _BILINEAR_GRID_SAMPLE_REFLECT_PARAM_NUM;
|
||||
}
|
||||
else
|
||||
{
|
||||
param_def_size = _BILINEAR_GRID_SAMPLE_PARAM_NUM;
|
||||
}
|
||||
|
||||
*is_use_u8_kernel = FALSE;
|
||||
}
|
||||
|
||||
key = BILINEAR_GRID_SAMPLE_HASH_KEY(in0_dtype, in1_dtype, out_dtype);
|
||||
key = BILINEAR_GRID_SAMPLE_HASH_KEY(in0_dtype, in1_dtype, out_dtype, is_reflect_mode);
|
||||
|
||||
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
|
||||
{
|
||||
|
|
@ -245,7 +272,7 @@ static vsi_nn_kernel_node_t _setup
|
|||
{
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM];
|
||||
vsi_nn_kernel_node_param_t node_params[_BILINEAR_GRID_SAMPLE_REFLECT_PARAM_QUANT_NUM];
|
||||
vsi_size_t final_shape[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1};
|
||||
uint32_t final_in1_rank = 0;
|
||||
vsi_nn_tensor_t* rs_tensors = NULL;
|
||||
|
|
@ -263,11 +290,14 @@ static vsi_nn_kernel_node_t _setup
|
|||
vsi_bool is_use_u8_kernel = FALSE;
|
||||
int32_t align_corners =
|
||||
vsi_nn_kernel_param_get_int32(params, "align_corners");
|
||||
int32_t pad_mode = vsi_nn_kernel_param_get_int32(params, "padding_mode");
|
||||
uint32_t pad_val = 0;
|
||||
int32_t depth = 0;
|
||||
vsi_nn_kernel_dtype_e in0_dtype;
|
||||
|
||||
float half_input0_w, half_input0_h, add_float_value_w, add_float_value_h;
|
||||
int32_t is_reflect_mode = 0;
|
||||
float min_val_w, span_w, min_val_h, span_h;
|
||||
|
||||
// Check if gpu can support the size
|
||||
if (!vsi_nn_kernel_gpu_check_shape(inputs[0]->attr.size,
|
||||
|
|
@ -280,6 +310,11 @@ static vsi_nn_kernel_node_t _setup
|
|||
return NULL;
|
||||
}
|
||||
|
||||
if (pad_mode == VSI_NN_PAD_MODE_REFLECT)
|
||||
{
|
||||
is_reflect_mode = 1;
|
||||
}
|
||||
|
||||
final_tensors[0] = inputs[0];
|
||||
|
||||
if (inputs[1]->attr.dim_num >= 3) {
|
||||
|
|
@@ -313,12 +348,35 @@
        add_float_value_h = half_input0_h - 0.5f;
    }

    if (is_reflect_mode)
    {
        float low_w, low_h, high_w, high_h;
        if (align_corners)
        {
            low_w = 0;
            low_h = 0;
            high_w = 2 * (float)(in0_width - 1);
            high_h = 2 * (float)(in0_height - 1);
        }
        else
        {
            low_w = -1;
            low_h = -1;
            high_w = 2 * (float)in0_width - 1;
            high_h = 2 * (float)in0_height - 1;
        }
        min_val_w = low_w / 2;
        span_w = (high_w - low_w) / 2;
        min_val_h = low_h / 2;
        span_h = (high_h - low_h) / 2;
    }

    depth = (int32_t)inputs[0]->attr.size[2];
    in0_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type);
    if (U8 == in0_dtype) {
        pad_val = inputs[0]->attr.dtype.zero_point;
    }
    status = _query_kernel(kernel, inputs, outputs, &is_use_u8_kernel);
    status = _query_kernel(kernel, inputs, outputs, &is_use_u8_kernel, is_reflect_mode);
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
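min_val_* and span_* describe the band the source coordinate is folded into under reflect padding: [0, size-1] with align_corners, [-0.5, size-0.5] otherwise, repeating with period 2 * span. A host-side sketch of the folding the GPU kernel is expected to apply (an assumption for illustration; the OpenCL source itself is not part of this hunk):

/* --- illustrative example, not part of the diff --- */
#include <math.h>

/* Fold a source coordinate into [min_val, min_val + span] by reflection,
 * matching the min_val/span parameters computed above. */
static float reflect_coord(float x, float min_val, float span)
{
    if (span <= 0.0f)
    {
        return min_val;
    }
    float r = fmodf(fabsf(x - min_val), 2.0f * span);
    return min_val + (r > span ? 2.0f * span - r : r);
}

/* e.g. with align_corners and in0_width = 5: min_val_w = 0, span_w = 4,
 * so x = 5.5 reflects to 2.5 and x = -1.0 reflects to 1.0. */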
@ -326,7 +384,7 @@ static vsi_nn_kernel_node_t _setup
|
|||
{
|
||||
size_t node_params_num = _BILINEAR_GRID_SAMPLE_PARAM_NUM;
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( node_params, _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM,
|
||||
vsi_nn_kernel_node_pack_io( node_params, _BILINEAR_GRID_SAMPLE_REFLECT_PARAM_QUANT_NUM,
|
||||
final_tensors, input_num, &final_tensors[2], output_num );
|
||||
node_params[SCALAR_HALF_INPUT0_W] = vsi_nn_kernel_scalar_create( graph, F32, &half_input0_w );
|
||||
node_params[SCALAR_HALF_INPUT0_H] = vsi_nn_kernel_scalar_create( graph, F32, &half_input0_h );
|
||||
|
|
@ -335,13 +393,19 @@ static vsi_nn_kernel_node_t _setup
|
|||
node_params[SCALAR_DEPTH] = vsi_nn_kernel_scalar_create( graph, I32, &depth );
|
||||
if (is_use_u8_kernel)
|
||||
{
|
||||
node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input0_scale );
|
||||
node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create( graph, F32, &input0_tail );
|
||||
node_params[SCALAR_INPUT1_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input1_scale );
|
||||
node_params[SCALAR_INPUT1_TAIL] = vsi_nn_kernel_scalar_create( graph, F32, &input1_tail );
|
||||
node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale );
|
||||
node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp );
|
||||
node_params_num = _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM;
|
||||
node_params[node_params_num++] = vsi_nn_kernel_scalar_create( graph, F32, &input0_scale );
|
||||
node_params[node_params_num++] = vsi_nn_kernel_scalar_create( graph, F32, &input0_tail );
|
||||
node_params[node_params_num++] = vsi_nn_kernel_scalar_create( graph, F32, &input1_scale );
|
||||
node_params[node_params_num++] = vsi_nn_kernel_scalar_create( graph, F32, &input1_tail );
|
||||
node_params[node_params_num++] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale );
|
||||
node_params[node_params_num++] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp );
|
||||
}
|
||||
if (is_reflect_mode)
|
||||
{
|
||||
node_params[node_params_num++] = vsi_nn_kernel_scalar_create(graph, F32, &min_val_w);
|
||||
node_params[node_params_num++] = vsi_nn_kernel_scalar_create(graph, F32, &span_w);
|
||||
node_params[node_params_num++] = vsi_nn_kernel_scalar_create(graph, F32, &min_val_h);
|
||||
node_params[node_params_num++] = vsi_nn_kernel_scalar_create(graph, F32, &span_h);
|
||||
}
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
|
||||
|
|
@ -351,19 +415,34 @@ static vsi_nn_kernel_node_t _setup
|
|||
vsi_nn_kernel_scalar_release(&node_params[SCALAR_ADD_VALUE_W]);
|
||||
vsi_nn_kernel_scalar_release(&node_params[SCALAR_ADD_VALUE_H]);
|
||||
vsi_nn_kernel_scalar_release(&node_params[SCALAR_DEPTH]);
|
||||
node_params_num = _BILINEAR_GRID_SAMPLE_PARAM_NUM;
|
||||
if (is_use_u8_kernel) {
|
||||
vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT0_SCALE]);
|
||||
vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT0_TAIL]);
|
||||
vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT1_SCALE]);
|
||||
vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT1_TAIL]);
|
||||
vsi_nn_kernel_scalar_release(&node_params[SCALAR_OUTPUT_SCALE]);
|
||||
vsi_nn_kernel_scalar_release(&node_params[SCALAR_OUTPUT_TAIL]);
|
||||
vsi_nn_kernel_scalar_release(&node_params[node_params_num++]);
|
||||
vsi_nn_kernel_scalar_release(&node_params[node_params_num++]);
|
||||
vsi_nn_kernel_scalar_release(&node_params[node_params_num++]);
|
||||
vsi_nn_kernel_scalar_release(&node_params[node_params_num++]);
|
||||
vsi_nn_kernel_scalar_release(&node_params[node_params_num++]);
|
||||
vsi_nn_kernel_scalar_release(&node_params[node_params_num++]);
|
||||
}
|
||||
if (is_reflect_mode)
|
||||
{
|
||||
vsi_nn_kernel_scalar_release(&node_params[node_params_num++]);
|
||||
vsi_nn_kernel_scalar_release(&node_params[node_params_num++]);
|
||||
vsi_nn_kernel_scalar_release(&node_params[node_params_num++]);
|
||||
vsi_nn_kernel_scalar_release(&node_params[node_params_num++]);
|
||||
}
|
||||
{
|
||||
// Set default border mode.
|
||||
vx_border_t border;
|
||||
border.mode = VX_BORDER_CONSTANT;
|
||||
border.constant_value.U32 = pad_val;
|
||||
if (pad_mode == VSI_NN_PAD_MODE_CONSTANT)
|
||||
{
|
||||
border.mode = VX_BORDER_CONSTANT;
|
||||
border.constant_value.U32 = pad_val;
|
||||
}
|
||||
else
|
||||
{
|
||||
border.mode = VX_BORDER_REPLICATE;
|
||||
}
|
||||
status = vxSetNodeAttribute(
|
||||
(vx_node)node, VX_NODE_BORDER, &border, sizeof(border));
|
||||
CHECK_STATUS(status);
|
||||
|
|
|
|||
|
|
@@ -244,7 +244,10 @@ DEF_KERNEL_INITIALIZER(_gather_elements_initializer)

final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
    SAFE_FREE_TENSOR_ATTR(input_attr0);
    SAFE_FREE_TENSOR_ATTR(input_attr1);
    SAFE_FREE_TENSOR_ATTR(output_attr);

    return status;
} /* _gather_elements_initializer() */
@ -34,6 +34,7 @@
|
|||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "utils/vsi_nn_dtype_util_prv.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
|
@ -1489,8 +1490,8 @@ static vsi_nn_kernel_node_t _setup
|
|||
float twoLogE = 2 * logE;
|
||||
uint32_t uint_min = 0xFBFFFFFF;
|
||||
uint32_t uint_max = 0x7BFFFFFF;
|
||||
float float_min = *(vx_float32 *)&uint_min;
|
||||
float float_max = *(vx_float32 *)&uint_max;
|
||||
float float_min = 0.0f;
|
||||
float float_max = 0.0f;
|
||||
float scale_val[9] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
|
||||
float tail_val[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
|
||||
vsi_bool is_u8_type = FALSE;
|
||||
|
|
@ -1499,6 +1500,12 @@ static vsi_nn_kernel_node_t _setup
|
|||
size_t lstm_activation_in_out_num = 0;
|
||||
uint32_t i;
|
||||
|
||||
fp32_bit_cast_t fp32_bit_cast;
|
||||
fp32_bit_cast.data = uint_min;
|
||||
float_min = fp32_bit_cast.val;
|
||||
fp32_bit_cast.data = uint_max;
|
||||
float_max = fp32_bit_cast.val;
|
||||
|
||||
_is_ln = vsi_nn_kernel_param_get_int32( params, "_is_ln" );
|
||||
_is_cifg = vsi_nn_kernel_param_get_int32( params, "_is_cifg" );
|
||||
_is_proj = vsi_nn_kernel_param_get_int32( params, "_is_proj" );
|
||||
|
|
|
|||
|
|
@ -266,6 +266,7 @@ static vsi_nn_kernel_node_t _setup
|
|||
vsi_bool is_use_u8_kernel = FALSE;
|
||||
int32_t align_corners =
|
||||
vsi_nn_kernel_param_get_int32(params, "align_corners");
|
||||
int32_t pad_mode = vsi_nn_kernel_param_get_int32(params, "padding_mode");
|
||||
uint32_t pad_val = 0;
|
||||
int32_t depth = 0;
|
||||
vsi_nn_kernel_dtype_e in0_dtype;
|
||||
|
|
@ -282,6 +283,11 @@ static vsi_nn_kernel_node_t _setup
|
|||
return NULL;
|
||||
}
|
||||
|
||||
if (pad_mode == VSI_NN_PAD_MODE_REFLECT)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
final_tensors[0] = inputs[0];
|
||||
if (inputs[1]->attr.dim_num >= 3) {
|
||||
final_shape[0] = inputs[1]->attr.size[1] * inputs[1]->attr.size[0];
|
||||
|
|
@ -382,8 +388,15 @@ static vsi_nn_kernel_node_t _setup
|
|||
{
|
||||
// Set default border mode.
|
||||
vx_border_t border;
|
||||
border.mode = VX_BORDER_CONSTANT;
|
||||
border.constant_value.U32 = pad_val;
|
||||
if (pad_mode == VSI_NN_PAD_MODE_CONSTANT)
|
||||
{
|
||||
border.mode = VX_BORDER_CONSTANT;
|
||||
border.constant_value.U32 = pad_val;
|
||||
}
|
||||
else
|
||||
{
|
||||
border.mode = VX_BORDER_REPLICATE;
|
||||
}
|
||||
status = vxSetNodeAttribute(
|
||||
(vx_node)node, VX_NODE_BORDER, &border, sizeof(border));
|
||||
CHECK_STATUS(status);
|
||||
|
|
|
|||
|
|
@ -36,6 +36,7 @@
|
|||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "utils/vsi_nn_dtype_util.h"
|
||||
#include "utils/vsi_nn_dtype_util_prv.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
|
|
@ -242,6 +243,7 @@ static vsi_nn_kernel_node_t _setup
|
|||
vsi_size_t suffix_dim_size = 0;
|
||||
int32_t depth = vsi_nn_kernel_param_get_int32( params, "depth" );
|
||||
vsi_nn_kernel_dtype_e out_dtype;
|
||||
fp32_bit_cast_t fp32_bit_cast;
|
||||
uint32_t data[2] = {0};
|
||||
float on_value = vsi_nn_kernel_param_get_float32( params, "on_value" );
|
||||
float off_value = vsi_nn_kernel_param_get_float32( params, "off_value" );
|
||||
|
|
@ -258,8 +260,11 @@ static vsi_nn_kernel_node_t _setup
|
|||
}
|
||||
else
|
||||
{
|
||||
data[0] = *(uint32_t*)&on_value;
|
||||
data[1] = *(uint32_t*)&off_value;
|
||||
fp32_bit_cast.val = on_value;
|
||||
data[0] = fp32_bit_cast.data;
|
||||
|
||||
fp32_bit_cast.val = off_value;
|
||||
data[1] = fp32_bit_cast.data;
|
||||
}
|
||||
|
||||
axis = axis == -1 ? (int32_t)inputs[0]->attr.dim_num : (int32_t)inputs[0]->attr.dim_num - axis;
|
||||
|
|
|
|||
|
|
@@ -34,20 +34,24 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"

__BEGIN_DECLS

#define _TOPK_KERNEL_SOURCE "topk"
#define STR(a) #a
// Add kernel hashtable here
#define TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ) \
    ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (STAGES << 16) )
#define TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES, SECTION ) \
    ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (STAGES << 16) | (SECTION << 26))
#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, STAGES ) \
    { TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ), \
    { TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES, 0 ), \
      CVIVANTE_NAMESPACE("cl.topk_stage"STR(STAGES)"_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \
      _TOPK_KERNEL_SOURCE }

#define PACK_MERGE_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
    { TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, 1 ), \
      CVIVANTE_NAMESPACE("cl.topk_stage_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \
      "topk2" }

#define TOPK_ODD_EVEN_SORT_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
    ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) )
#define PACK_ODD_EVEN_SORT_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
@ -111,6 +115,9 @@ static const _kernel_map_type _topk_kernel_map[] =
|
|||
PACK_KERNEL_MAP( F32, I32, 4 ),
|
||||
PACK_KERNEL_MAP( F32, I32, 5 ),
|
||||
PACK_KERNEL_MAP( F32, I32, 6 ),
|
||||
|
||||
PACK_MERGE_KERNEL_MAP(U32, U32),
|
||||
PACK_MERGE_KERNEL_MAP(I32, I32),
|
||||
};
|
||||
|
||||
static const _kernel_map_type _topk_odd_even_sort_kernel_map[] =
|
||||
|
|
@ -254,7 +261,8 @@ static vsi_status _query_kernel
|
|||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs,
|
||||
int32_t num_stages
|
||||
int32_t num_stages,
|
||||
vsi_bool is_bitnoic_segment
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
|
|
@ -272,21 +280,23 @@ static vsi_status _query_kernel
|
|||
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
|
||||
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
|
||||
|
||||
num_stages = is_bitnoic_segment ? 0 : num_stages;
|
||||
|
||||
switch (_PACK_SELECT_KEY(in_dtype, out_dtype))
|
||||
{
|
||||
case _PACK_SELECT_KEY(F32, F32):
|
||||
case _PACK_SELECT_KEY(F16, F16):
|
||||
key = TOPK_HASH_KEY( F32, F32, num_stages );
|
||||
key = TOPK_HASH_KEY( F32, F32, num_stages, is_bitnoic_segment );
|
||||
break;
|
||||
case _PACK_SELECT_KEY(U32, U32):
|
||||
case _PACK_SELECT_KEY(U16, U16):
|
||||
case _PACK_SELECT_KEY(U8, U8):
|
||||
key = TOPK_HASH_KEY( U32, U32, num_stages );
|
||||
key = TOPK_HASH_KEY( U32, U32, num_stages, is_bitnoic_segment );
|
||||
break;
|
||||
case _PACK_SELECT_KEY(I32, I32):
|
||||
case _PACK_SELECT_KEY(I16, I16):
|
||||
case _PACK_SELECT_KEY(I8, I8):
|
||||
key = TOPK_HASH_KEY( I32, I32, num_stages );
|
||||
key = TOPK_HASH_KEY( I32, I32, num_stages, is_bitnoic_segment );
|
||||
break;
|
||||
case _PACK_SELECT_KEY(F32, U32):
|
||||
case _PACK_SELECT_KEY(F16, U32):
|
||||
|
|
@ -294,7 +304,7 @@ static vsi_status _query_kernel
|
|||
case _PACK_SELECT_KEY(F16, U16):
|
||||
case _PACK_SELECT_KEY(F32, U8):
|
||||
case _PACK_SELECT_KEY(F16, U8):
|
||||
key = TOPK_HASH_KEY( F32, U32, num_stages );
|
||||
key = TOPK_HASH_KEY( F32, U32, num_stages, is_bitnoic_segment );
|
||||
break;
|
||||
case _PACK_SELECT_KEY(F32, I32):
|
||||
case _PACK_SELECT_KEY(F16, I32):
|
||||
|
|
@ -302,7 +312,7 @@ static vsi_status _query_kernel
|
|||
case _PACK_SELECT_KEY(F16, I16):
|
||||
case _PACK_SELECT_KEY(F32, I8):
|
||||
case _PACK_SELECT_KEY(F16, I8):
|
||||
key = TOPK_HASH_KEY( F32, I32, num_stages );
|
||||
key = TOPK_HASH_KEY( F32, I32, num_stages, is_bitnoic_segment );
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
|
|
@ -440,7 +450,12 @@ static vsi_nn_kernel_node_t _setup
|
|||
int32_t top_k = vsi_nn_kernel_param_get_int32(params, "top_k");
|
||||
int32_t num_stages = (int32_t)vsi_nn_max(ceil(log10(block_size / 2.0f) / log10(2.0f)), 0);
|
||||
vsi_bool is_odd_even_sort = FALSE;
|
||||
vsi_bool is_bitnoic_segment = FALSE;
|
||||
size_t param_num = _TOPK_PARAM_NUM;
|
||||
int32_t max_stages = 7 + (int32_t)log2(graph->ctx->config.subGroupSize >> 2);
|
||||
vsi_nn_kernel_dtype_e type0 = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
|
||||
vsi_nn_kernel_dtype_e type1 = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
|
||||
|
||||
float inputScale = vsi_nn_get_tensor_scale(inputs[0]);
|
||||
float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
|
||||
float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
|
||||
|
|
@@ -471,9 +486,14 @@ static vsi_nn_kernel_node_t _setup
    rs_tensors[0] = vsi_nn_reshape_tensor( graph,
        inputs[0], shape[0], 2 );

    if (num_stages < 7)
    is_bitnoic_segment = (num_stages >= 9) && (top_k <= 512 && max_stages > 9) &&
        type0 == type1 && (type0 == U8 || type0 == I8 || type0 == I16 || type0 == U16 || type0 == I32 || type0 == U32);
    num_stages = is_bitnoic_segment ? 9 : num_stages;
    max_stages = is_bitnoic_segment ? max_stages : 7;

    if (num_stages < max_stages || is_bitnoic_segment)
    {
        status = _query_kernel( kernel, inputs, outputs, num_stages );
        status = _query_kernel( kernel, inputs, outputs, num_stages, is_bitnoic_segment );

        rs_tensors[1] = vsi_nn_reshape_tensor( graph,
            outputs[0], shape[1], 2 );
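num_stages is ceil(log2(block_size / 2)), as computed earlier in _setup(), and the new segmented-bitonic path takes over once a single bitonic kernel would need nine or more stages. A quick check of the stage arithmetic (illustration only):

/* --- illustrative example, not part of the diff --- */
#include <math.h>
#include <stdio.h>

int main(void)
{
    /* num_stages = ceil(log2(block_size / 2)), written with log10 as in _setup() */
    int block_sizes[] = { 128, 1024, 4096 };
    for (int i = 0; i < 3; i++)
    {
        double block_size = (double)block_sizes[i];
        int num_stages = (int)fmax(ceil(log10(block_size / 2.0) / log10(2.0)), 0.0);
        printf("block_size=%4d -> num_stages=%d\n", block_sizes[i], num_stages);
        /* 128 -> 6 (single-kernel path), 1024 -> 9, 4096 -> 11
         * (>= 9 is a candidate for the segmented bitonic merge path) */
    }
    return 0;
}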
@ -52,15 +52,26 @@ typedef enum
|
|||
#define _BILINEAR_GRID_SAMPLE_KERNEL_SOURCE(_input_type, _output_type) \
|
||||
"bilinear_grid_sample_" #_input_type "_to_" #_output_type
|
||||
|
||||
#define _BILINEAR_GRID_SAMPLE_REFLECT_KERNEL_SOURCE(_input_type, _output_type) \
|
||||
"bilinear_grid_sample_reflect_" #_input_type "_to_" #_output_type
|
||||
|
||||
// Add kernel hashtable here
|
||||
#define BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
|
||||
((IN1_DTYPE << 20) | (IN0_DTYPE << 8) | (OUT_DTYPE))
|
||||
#define BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, REFLECT) \
|
||||
((IN1_DTYPE << 24) | (IN0_DTYPE << 16) | (OUT_DTYPE << 8) | (REFLECT))
|
||||
|
||||
#define PACK_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
|
||||
{ \
|
||||
BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE), \
|
||||
CVIVANTE_NAMESPACE("evis.bilinear_grid_sample_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \
|
||||
_BILINEAR_GRID_SAMPLE_KERNEL_SOURCE(IN0_DTYPE, OUT_DTYPE) \
|
||||
}
|
||||
{ \
|
||||
BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0), \
|
||||
CVIVANTE_NAMESPACE("evis.bilinear_grid_sample_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \
|
||||
_BILINEAR_GRID_SAMPLE_KERNEL_SOURCE(IN0_DTYPE, OUT_DTYPE) \
|
||||
}
|
||||
|
||||
#define PACK_REFLECT_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
|
||||
{ \
|
||||
BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1), \
|
||||
CVIVANTE_NAMESPACE("evis.bilinear_grid_sample_reflect_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \
|
||||
_BILINEAR_GRID_SAMPLE_REFLECT_KERNEL_SOURCE(IN0_DTYPE, OUT_DTYPE) \
|
||||
}
|
||||
|
||||
typedef struct
|
||||
{
|
||||
|
|
@ -83,6 +94,18 @@ static const _kernel_map_type _bilinear_grid_sample_kernel_map[] =
|
|||
PACK_KERNEL_MAP(I16, I16, I16),
|
||||
PACK_KERNEL_MAP(I8, I8, I8),
|
||||
PACK_KERNEL_MAP(BF16, BF16, BF16),
|
||||
PACK_REFLECT_KERNEL_MAP(F16, F32, F16),
|
||||
PACK_REFLECT_KERNEL_MAP(F16, U8, F16),
|
||||
PACK_REFLECT_KERNEL_MAP(F16, F16, F16),
|
||||
PACK_REFLECT_KERNEL_MAP(F16, F32, U8),
|
||||
PACK_REFLECT_KERNEL_MAP(F16, F16, U8),
|
||||
PACK_REFLECT_KERNEL_MAP(F16, U8, U8),
|
||||
PACK_REFLECT_KERNEL_MAP(U8, U8, U8),
|
||||
PACK_REFLECT_KERNEL_MAP(U8, F16, U8),
|
||||
PACK_REFLECT_KERNEL_MAP(U8, F32, U8),
|
||||
PACK_REFLECT_KERNEL_MAP(I16, I16, I16),
|
||||
PACK_REFLECT_KERNEL_MAP(I8, I8, I8),
|
||||
PACK_REFLECT_KERNEL_MAP(BF16, BF16, BF16),
|
||||
};
|
||||
|
||||
|
||||
|
|
@ -96,18 +119,20 @@ static vx_param_description_t _bilinear_grid_sample_kernel_param_def[] =
|
|||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
};
|
||||
#define _BILINEAR_GRID_SAMPLE_PARAM_NUM _cnt_of_array( _bilinear_grid_sample_kernel_param_def )
|
||||
#define _BILINEAR_GRID_SAMPLE_PARAM_NUM \
|
||||
_cnt_of_array( _bilinear_grid_sample_kernel_param_def )
|
||||
|
||||
#define SCALAR_ALIGN_CORNERS (3)
|
||||
|
||||
/*
|
||||
* Kernel initializer
|
||||
*/
|
||||
DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
|
||||
static vsi_status _bilinear_grid_sample_initializer_base
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
size_t param_size,
|
||||
vsi_bool is_reflect_mode
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
|
|
@ -135,6 +160,8 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
|
|||
int32_t input1ZP = 0;
|
||||
float output_scale = 1.0;
|
||||
int32_t outputZP = 0;
|
||||
float min_val_wh[4] = { 0 };
|
||||
float span_wh[4] = { 0 };
|
||||
|
||||
VSI_UNREFERENCED(param_size);
|
||||
|
||||
|
|
@ -156,6 +183,7 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
|
|||
(vsi_nn_kernel_scalar_t)param[SCALAR_ALIGN_CORNERS], &(align_corners));
|
||||
CHECK_STATUS_FAIL_GOTO(status, final);
|
||||
|
||||
|
||||
out_shape = output_attr->shape;
|
||||
in0_shape = input_attr[0]->shape;
|
||||
input0_dtype = input_attr[0]->dtype;
|
||||
|
|
@ -193,6 +221,35 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
|
|||
status |= vsi_nn_kernel_gpu_add_param(node, "add_float_value", add_float_value);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "depth", &depth);
|
||||
|
||||
if (is_reflect_mode)
|
||||
{
|
||||
float low_w, low_h, high_w, high_h;
|
||||
if (align_corners)
|
||||
{
|
||||
low_w = 0;
|
||||
low_h = 0;
|
||||
high_w = 2 * (float)(in0_width - 1);
|
||||
high_h = 2 * (float)(in0_height - 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
low_w = -1;
|
||||
low_h = -1;
|
||||
high_w = 2 * (float)in0_width - 1;
|
||||
high_h = 2 * (float)in0_height - 1;
|
||||
}
|
||||
min_val_wh[0] = low_w / 2;
|
||||
span_wh[0] = (high_w - low_w) / 2;
|
||||
min_val_wh[1] = low_h / 2;
|
||||
span_wh[1] = (high_h - low_h) / 2;
|
||||
min_val_wh[2] = min_val_wh[0];
|
||||
min_val_wh[3] = min_val_wh[1];
|
||||
span_wh[2] = span_wh[0];
|
||||
span_wh[3] = span_wh[1];
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "span_wh", span_wh);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "min_val_wh", min_val_wh);
|
||||
}
|
||||
|
||||
{
|
||||
gpu_dp_inst_t uniFp16toFp32_part0_4x4 = {
|
||||
{
|
||||
|
|
@ -538,6 +595,28 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
|
|||
|
||||
|
||||
|
||||
DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t* param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
return _bilinear_grid_sample_initializer_base(
|
||||
node, param, param_size, vx_false_e);
|
||||
}
|
||||
|
||||
DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_reflect_initializer)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t* param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
return _bilinear_grid_sample_initializer_base(
|
||||
node, param, param_size, vx_true_e);
|
||||
}
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
|
|
@ -545,7 +624,8 @@ static vsi_status _query_kernel
|
|||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs
|
||||
vsi_nn_tensor_t * const * const outputs,
|
||||
int32_t is_reflect_mode
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
|
|
@ -563,7 +643,16 @@ static vsi_status _query_kernel
|
|||
in1_dtype = vsi_nn_kernel_map_dtype(inputs[1]->attr.dtype.vx_type);
|
||||
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
|
||||
|
||||
key = BILINEAR_GRID_SAMPLE_HASH_KEY(in0_dtype, in1_dtype, out_dtype);
|
||||
key = BILINEAR_GRID_SAMPLE_HASH_KEY(in0_dtype, in1_dtype, out_dtype, is_reflect_mode);
|
||||
|
||||
if (is_reflect_mode)
|
||||
{
|
||||
initializer = _bilinear_grid_sample_reflect_initializer;
|
||||
}
|
||||
else
|
||||
{
|
||||
initializer = _bilinear_grid_sample_initializer;
|
||||
}
|
||||
|
||||
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
|
||||
{
|
||||
|
|
@ -605,13 +694,21 @@ static vsi_nn_kernel_node_t _setup
|
|||
vsi_nn_kernel_node_param_t node_params[_BILINEAR_GRID_SAMPLE_PARAM_NUM];
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
vsi_size_t final_shape[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1};
|
||||
uint32_t final_in1_rank = 0;
|
||||
vsi_size_t final_out_shape[VSI_NN_MAX_DIM_NUM] = { 1, 1, 1, 1 };
|
||||
uint32_t final_in1_rank = 0, final_out_rank = 0;
|
||||
vsi_nn_tensor_t* rs_tensors = NULL;
|
||||
vsi_nn_tensor_t* rs_out_tensors = NULL;
|
||||
vsi_nn_tensor_t* final_tensors[3] = {NULL};
|
||||
vsi_nn_kernel_dtype_e in0_dtype;
|
||||
uint32_t pad_val = 0;
|
||||
int32_t align_corners =
|
||||
vsi_nn_kernel_param_get_int32(params, "align_corners");
|
||||
int32_t pad_mode = vsi_nn_kernel_param_get_int32(params, "padding_mode");
|
||||
int32_t is_reflect_mode = 0;
|
||||
vsi_size_t in_size_x = inputs[1]->attr.size[1];
|
||||
vsi_size_t in_size_y = inputs[1]->attr.dim_num >= 3 ? inputs[1]->attr.size[2] : 1;
|
||||
vsi_size_t new_size_x = in_size_x, new_size_y = in_size_y;
|
||||
vsi_bool is_reshape_out = vx_false_e;
|
||||
|
||||
// Check if gpu can support the size
|
||||
if (!vsi_nn_kernel_gpu_check_shape(inputs[0]->attr.size,
|
||||
|
|
@ -624,12 +721,63 @@ static vsi_nn_kernel_node_t _setup
|
|||
return NULL;
|
||||
}
|
||||
|
||||
if (pad_mode == VSI_NN_PAD_MODE_REFLECT)
|
||||
{
|
||||
is_reflect_mode = 1;
|
||||
}
|
||||
|
||||
final_tensors[0] = inputs[0];
|
||||
|
||||
is_reshape_out = vx_false_e;
|
||||
if (inputs[1]->attr.dim_num >= 3) {
|
||||
vsi_size_t shape_x[2];
|
||||
vsi_size_t out_shape_x[2];
|
||||
vsi_size_t out_rank_x;
|
||||
shape_x[0] = in_size_x;
|
||||
shape_x[1] = in_size_y;
|
||||
vsi_nn_kernel_optimize_element_shape(shape_x, 2, out_shape_x, &out_rank_x);
|
||||
if (out_rank_x == 2)
|
||||
{
|
||||
new_size_x = out_shape_x[0];
|
||||
new_size_y = out_shape_x[1];
|
||||
}
|
||||
|
||||
final_shape[0] = inputs[1]->attr.size[1] * inputs[1]->attr.size[0];
|
||||
final_shape[1] = inputs[1]->attr.size[2];
|
||||
if ((new_size_x == in_size_x) && (new_size_y == in_size_y))
|
||||
{
|
||||
is_reshape_out = vx_false_e;
|
||||
}
|
||||
else if ((new_size_x * 2) >= GPU_TENSOR_MAX_WIDTH)
|
||||
{
|
||||
is_reshape_out = vx_false_e;
|
||||
}
|
||||
else
|
||||
{
|
||||
is_reshape_out = vx_true_e;
|
||||
}
|
||||
|
||||
if (is_reshape_out == vx_false_e)
|
||||
{
|
||||
new_size_x = in_size_x;
|
||||
new_size_y = in_size_y;
|
||||
if ((new_size_x < new_size_y) && ((new_size_y * 2) < GPU_TENSOR_MAX_WIDTH))
|
||||
{
|
||||
vsi_size_t tmp = new_size_x;
|
||||
new_size_x = new_size_y;
|
||||
new_size_y = tmp;
|
||||
is_reshape_out = vx_true_e;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if (((new_size_x * 2) >= GPU_TENSOR_MAX_WIDTH) || (new_size_y >= GPU_TENSOR_MAX_WIDTH))
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (inputs[1]->attr.dim_num >= 3) {
|
||||
final_shape[0] = new_size_x * inputs[1]->attr.size[0];
|
||||
final_shape[1] = new_size_y;
|
||||
final_shape[2] = 1;
|
||||
final_shape[3] = inputs[1]->attr.dim_num > 3 ? inputs[1]->attr.size[3] : 1;
|
||||
final_in1_rank =
|
||||
|
|
@ -643,14 +791,32 @@ static vsi_nn_kernel_node_t _setup
|
|||
} else {
|
||||
final_tensors[1] = inputs[1];
|
||||
}
|
||||
final_tensors[2] = outputs[0];
|
||||
|
||||
if (is_reshape_out)
|
||||
{
|
||||
final_out_shape[0] = new_size_x;
|
||||
final_out_shape[1] = new_size_y;
|
||||
final_out_shape[2] = outputs[0]->attr.dim_num > 2 ? outputs[0]->attr.size[2] : 1;
|
||||
final_out_shape[3] = outputs[0]->attr.dim_num > 3 ? outputs[0]->attr.size[3] : 1;
|
||||
final_out_rank = outputs[0]->attr.dim_num;
|
||||
if (!vsi_nn_kernel_gpu_check_shape(final_out_shape, final_out_rank)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
rs_out_tensors = vsi_nn_reshape_tensor(graph, outputs[0], final_out_shape, final_out_rank);
|
||||
final_tensors[2] = rs_out_tensors;
|
||||
}
|
||||
else
|
||||
{
|
||||
final_tensors[2] = outputs[0];
|
||||
}
|
||||
|
||||
in0_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type);
|
||||
if (U8 == in0_dtype) {
|
||||
pad_val = inputs[0]->attr.dtype.zero_point;
|
||||
}
|
||||
|
||||
status = _query_kernel( kernel, inputs, outputs );
|
||||
status = _query_kernel( kernel, inputs, outputs, is_reflect_mode);
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
|
|
@ -662,14 +828,22 @@ static vsi_nn_kernel_node_t _setup
|
|||
node_params[SCALAR_ALIGN_CORNERS] =
|
||||
vsi_nn_kernel_scalar_create(graph, I32, &align_corners);
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _BILINEAR_GRID_SAMPLE_PARAM_NUM );
|
||||
status = vsi_nn_kernel_node_pass_param(
|
||||
node, node_params, _BILINEAR_GRID_SAMPLE_PARAM_NUM );
|
||||
VSI_ASSERT(status == VSI_SUCCESS);
|
||||
vsi_nn_kernel_scalar_release(&node_params[SCALAR_ALIGN_CORNERS]);
|
||||
{
|
||||
// Set default border mode.
|
||||
vx_border_t border;
|
||||
border.mode = VX_BORDER_CONSTANT;
|
||||
border.constant_value.U32 = pad_val;
|
||||
if (pad_mode == VSI_NN_PAD_MODE_CONSTANT)
|
||||
{
|
||||
border.mode = VX_BORDER_CONSTANT;
|
||||
border.constant_value.U32 = pad_val;
|
||||
}
|
||||
else
|
||||
{
|
||||
border.mode = VX_BORDER_REPLICATE;
|
||||
}
|
||||
status = vxSetNodeAttribute(
|
||||
(vx_node)node, VX_NODE_BORDER, &border, sizeof(border));
|
||||
CHECK_STATUS(status);
|
||||
|
|
@ -678,6 +852,7 @@ static vsi_nn_kernel_node_t _setup
|
|||
}
|
||||
|
||||
vsi_safe_release_tensor(rs_tensors);
|
||||
vsi_safe_release_tensor(rs_out_tensors);
|
||||
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
|
|
|||
|
|
@ -513,6 +513,8 @@ static vsi_nn_kernel_node_t _setup
|
|||
vsi_nn_kernel_scalar_release( &node_params[4] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[5] );
|
||||
}
|
||||
|
||||
if ( node )
|
||||
{
|
||||
// Set default border mode.
|
||||
vx_border_t border;
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@
|
|||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "utils/vsi_nn_dtype_util_prv.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "libnnext/vx_lib_nnext.h"
|
||||
|
||||
|
|
@ -1002,8 +1003,8 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer)
|
|||
float twoLogE = 2 * logE;
|
||||
uint32_t uint_min = 0xFBFFFFFF;
|
||||
uint32_t uint_max = 0x7BFFFFFF;
|
||||
float float_min = *(float *)&uint_min;
|
||||
float float_max = *(float *)&uint_max;
|
||||
float float_min = 0.0f;
|
||||
float float_max = 0.0f;
|
||||
float clip_Min_F[4] = {0};
|
||||
float clip_Max_F[4] = {0};
|
||||
uint32_t i = 0;
|
||||
|
|
@ -1017,6 +1018,12 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer)
|
|||
vsi_nn_kernel_tensor_attr_t* input_attr[9] = {NULL};
|
||||
vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL};
|
||||
|
||||
fp32_bit_cast_t fp32_bit_cast;
|
||||
fp32_bit_cast.data = uint_min;
|
||||
float_min = fp32_bit_cast.val;
|
||||
fp32_bit_cast.data = uint_max;
|
||||
float_max = fp32_bit_cast.val;
|
||||
|
||||
status = vsi_nn_kernel_scalar_read_int32( (vsi_nn_kernel_scalar_t)param[param_size - 5], &_is_ln );
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
status = vsi_nn_kernel_scalar_read_int32( (vsi_nn_kernel_scalar_t)param[param_size - 4], &_is_cifg );
|
||||
|
|
|
|||
|
|
@ -539,6 +539,7 @@ static vsi_nn_kernel_node_t _setup
|
|||
uint32_t pad_val = 0;
|
||||
int32_t align_corners =
|
||||
vsi_nn_kernel_param_get_int32(params, "align_corners");
|
||||
int32_t pad_mode = vsi_nn_kernel_param_get_int32(params, "padding_mode");
|
||||
|
||||
// Check if gpu can support the size
|
||||
if (!vsi_nn_kernel_gpu_check_shape(inputs[0]->attr.size,
|
||||
|
|
@ -551,6 +552,11 @@ static vsi_nn_kernel_node_t _setup
|
|||
return NULL;
|
||||
}
|
||||
|
||||
if (pad_mode == VSI_NN_PAD_MODE_REFLECT)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
final_tensors[0] = inputs[0];
|
||||
|
||||
if (inputs[1]->attr.dim_num >= 3) {
|
||||
|
|
@ -596,8 +602,15 @@ static vsi_nn_kernel_node_t _setup
|
|||
{
|
||||
// Set default border mode.
|
||||
vx_border_t border;
|
||||
border.mode = VX_BORDER_CONSTANT;
|
||||
border.constant_value.U32 = pad_val;
|
||||
if (pad_mode == VSI_NN_PAD_MODE_CONSTANT)
|
||||
{
|
||||
border.mode = VX_BORDER_CONSTANT;
|
||||
border.constant_value.U32 = pad_val;
|
||||
}
|
||||
else
|
||||
{
|
||||
border.mode = VX_BORDER_REPLICATE;
|
||||
}
|
||||
status = vxSetNodeAttribute(
|
||||
(vx_node)node, VX_NODE_BORDER, &border, sizeof(border));
|
||||
CHECK_STATUS(status);
|
||||
|
|
|
|||
|
|
@ -754,7 +754,7 @@ static vsi_nn_kernel_node_t _setup
|
|||
if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensor->attr.size,
|
||||
outputs[0]->attr.dim_num ) )
|
||||
{
|
||||
return NULL;
|
||||
goto final;
|
||||
}
|
||||
|
||||
if ( width == (int32_t)inputs[0]->attr.size[0] && height == (int32_t)inputs[0]->attr.size[1] &&
|
||||
|
|
|
|||
|
|
@ -915,7 +915,7 @@ static vsi_nn_kernel_node_t _setup
|
|||
if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensor->attr.size,
|
||||
outputs[0]->attr.dim_num ) )
|
||||
{
|
||||
return NULL;
|
||||
goto final;
|
||||
}
|
||||
|
||||
if ( width == (int32_t)inputs[0]->attr.size[0] && height == (int32_t)inputs[0]->attr.size[1] &&
|
||||
|
|
|
|||
|
|
@ -290,13 +290,14 @@ static vsi_nn_tensor_t* _create_scale_tensor
|
|||
vsi_nn_tensor_t* scale = NULL;
|
||||
vsi_size_t i = 0;
|
||||
float *scale_data_ptr = NULL;
|
||||
int *index_data_ptr = NULL;
|
||||
int32_t *index_data_ptr = NULL;
|
||||
float scale_value = 0;
|
||||
vsi_ssize_t data = 0;
|
||||
int idx = 0;
|
||||
int32_t idx = 0;
|
||||
float delta_v = 0;
|
||||
float cubic_coeff_a = -0.5f;
|
||||
vsi_size_t item_count = 4 * output_size;
|
||||
|
||||
scale_data_ptr = (float *)malloc(item_count * sizeof(float));
|
||||
if (scale_data_ptr == NULL)
|
||||
{
|
||||
|
|
@ -316,7 +317,7 @@ static vsi_nn_tensor_t* _create_scale_tensor
|
|||
scale_value = ((float)i + half_pixel_value) * scale_factor - half_pixel_value;
|
||||
data = (vsi_ssize_t)scale_value;
|
||||
delta_v = scale_value - (float)data;
|
||||
idx = (int)data - 1;
|
||||
idx = (int32_t)data - 1;
|
||||
|
||||
index_data_ptr[i] = idx;
|
||||
scale_data_ptr[i * 4 + 0] = cubic_coeff_a * (((delta_v - 4) * (delta_v + 1) + 8) * (delta_v + 1) - 4);
|
||||
|
|
@ -331,11 +332,6 @@ static vsi_nn_tensor_t* _create_scale_tensor
|
|||
attr.vtl = FALSE;
|
||||
|
||||
scale = vsi_nn_CreateTensorFromData(graph, (uint8_t *)scale_data_ptr, &attr);
|
||||
if (scale_data_ptr)
|
||||
{
|
||||
free (scale_data_ptr);
|
||||
scale_data_ptr = NULL;
|
||||
}
|
||||
|
||||
attr.size[0] = output_size;
|
||||
attr.dim_num = 1;
|
||||
|
|
@ -343,13 +339,11 @@ static vsi_nn_tensor_t* _create_scale_tensor
|
|||
attr.vtl = FALSE;
|
||||
|
||||
*index = vsi_nn_CreateTensorFromData(graph, (uint8_t *)index_data_ptr, &attr);
|
||||
if (index_data_ptr)
|
||||
{
|
||||
free (index_data_ptr);
|
||||
index_data_ptr = NULL;
|
||||
}
|
||||
|
||||
OnError:
|
||||
vsi_nn_safe_free(scale_data_ptr);
|
||||
vsi_nn_safe_free(index_data_ptr);
|
||||
|
||||
return scale;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1218,6 +1218,11 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector
|
|||
status = backend->select( graph, inputs, input_num, outputs, output_num,
|
||||
params, &selector );
|
||||
VSI_ASSERT( status == VSI_SUCCESS );
|
||||
|
||||
if ( status != VSI_SUCCESS ) {
|
||||
VSILOGW("Failed to select kernel \"%s\"", kernel_name);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
|
|
|||
|
|
@ -171,7 +171,9 @@ vsi_status vsi_nn_kernel_copy_tensor_patch
|
|||
|
||||
vsi_nn_kernel_tensor_attr_get_stride( attr, stride );
|
||||
memset(start, 0, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM);
|
||||
for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++)
|
||||
memset(end, 0, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM);
|
||||
|
||||
for (i = 0; i < (uint32_t)attr->shape->size; i++)
|
||||
{
|
||||
end[i] = attr->shape->data[i];
|
||||
if ( attr->dtype != I4 && attr->dtype != U4 )
|
||||
|
|
@ -490,7 +492,7 @@ vsi_status vsi_nn_kernel_scalar_get_dtype
|
|||
( vsi_nn_kernel_scalar_t scalar, DTYPE * ptr ) \
|
||||
{ \
|
||||
vsi_status status; \
|
||||
vsi_nn_kernel_dtype_e dtype; \
|
||||
vsi_nn_kernel_dtype_e dtype = INVALID_DTYPE; \
|
||||
if( !ptr ) \
|
||||
{ \
|
||||
VSILOGE("Pointer to store scalar is null"); \
|
||||
|
|
|
|||
|
|
@ -0,0 +1,94 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2021 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_node.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
|
||||
|
||||
#define REGISTER_RMS_NORM_OPENVX_KERNEL( kernel_name ) \
|
||||
static vsi_nn_kernel_node_t _##kernel_name##setup \
|
||||
( \
|
||||
vsi_nn_graph_t * graph, \
|
||||
vsi_nn_tensor_t ** inputs, \
|
||||
size_t input_num, \
|
||||
vsi_nn_tensor_t ** outputs, \
|
||||
size_t output_num,\
|
||||
const vsi_nn_kernel_param_t * params, \
|
||||
vsi_nn_kernel_t * kernel \
|
||||
); \
|
||||
REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
|
||||
static vsi_nn_kernel_node_t _##kernel_name##setup \
|
||||
( \
|
||||
vsi_nn_graph_t * graph, \
|
||||
vsi_nn_tensor_t ** inputs, \
|
||||
size_t input_num, \
|
||||
vsi_nn_tensor_t ** outputs, \
|
||||
size_t output_num,\
|
||||
const vsi_nn_kernel_param_t * params, \
|
||||
vsi_nn_kernel_t * kernel \
|
||||
)
|
||||
|
||||
REGISTER_RMS_NORM_OPENVX_KERNEL(rms_norm)
|
||||
{
|
||||
vx_node node = NULL;
|
||||
|
||||
#if (VX_RMS_NORM_VX_SUPPORT)
|
||||
float eps = vsi_nn_kernel_param_get_float32( params, "eps" );
|
||||
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
|
||||
vx_tensor inputs_tensor[2] = {NULL};
|
||||
vx_tensor output_tensor = NULL;
|
||||
|
||||
VSI_UNREFERENCED(kernel);
|
||||
VSI_UNREFERENCED(output_num);
|
||||
|
||||
inputs_tensor[0] = inputs[0]->t;
|
||||
inputs_tensor[1] = inputs[1]->t;
|
||||
output_tensor = outputs[0]->t;
|
||||
|
||||
node = vxRMSNormalizationLayer(
|
||||
graph->g,
|
||||
eps,
|
||||
axis,
|
||||
inputs_tensor,
|
||||
(uint32_t)input_num,
|
||||
output_tensor
|
||||
);
|
||||
#else
|
||||
VSI_UNREFERENCED(output_num);
|
||||
VSI_UNREFERENCED(graph);
|
||||
VSI_UNREFERENCED(inputs);
|
||||
VSI_UNREFERENCED(input_num);
|
||||
VSI_UNREFERENCED(outputs);
|
||||
VSI_UNREFERENCED(output_num);
|
||||
VSI_UNREFERENCED(params);
|
||||
VSI_UNREFERENCED(kernel);
|
||||
#endif
|
||||
|
||||
return (vsi_nn_kernel_node_t)node;
|
||||
} /* rms_norm() */
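Note: vxRMSNormalizationLayer is only reachable when the driver defines VX_RMS_NORM_VX_SUPPORT. For reference, a minimal C sketch of the RMS normalization the new RMSNORM op computes along one axis, assuming inputs[1] is the per-element scale (gamma) tensor and eps comes from vsi_nn_rmsnorm_param:

#include <math.h>
#include <stddef.h>

/* Minimal sketch of RMS normalization over one axis of length n:
 *     y[i] = x[i] / sqrt(mean(x*x) + eps) * gamma[i]                        */
static void rms_norm_1d(const float *x, const float *gamma,
                        float *y, size_t n, float eps)
{
    float sum_sq = 0.0f;
    for (size_t i = 0; i < n; i++)
    {
        sum_sq += x[i] * x[i];
    }
    float inv_rms = 1.0f / sqrtf(sum_sq / (float)n + eps);
    for (size_t i = 0; i < n; i++)
    {
        y[i] = x[i] * inv_rms * gamma[i];
    }
}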
@ -0,0 +1,169 @@
|
|||
__kernel void bilinear_grid_sample_reflect_F32_F32toF32(
|
||||
__read_only image2d_array_t input0,
|
||||
__read_only image2d_t input1,
|
||||
__write_only image2d_array_t output,
|
||||
float half_input0_w,
|
||||
float half_input0_h,
|
||||
float add_float_value_w,
|
||||
float add_float_value_h,
|
||||
int depth,
|
||||
float min_val_w,
|
||||
float span_w,
|
||||
float min_val_h,
|
||||
float span_h
|
||||
)
|
||||
{
|
||||
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
|
||||
int2 coord_in1 = (int2)(get_global_id(0) * 2, get_global_id(1));
|
||||
int2 coord_add = (int2)(-1, 1);
|
||||
|
||||
float fx = read_imagef(input1, coord_in1).x;
|
||||
coord_in1.x = coord_in1.x + 1;
|
||||
float fy = read_imagef(input1, coord_in1).x;
|
||||
|
||||
fx = fx * half_input0_w + add_float_value_w;
|
||||
fy = fy * half_input0_h + add_float_value_h;
|
||||
|
||||
if (span_w > 0)
|
||||
{
|
||||
fx = fabs(fx - min_val_w);
|
||||
int flips_x = (int)(fx / span_w);
|
||||
float extra_x = fx - flips_x * span_w;
|
||||
fx = (flips_x & 0x01) ? min_val_w + (span_w - extra_x) : min_val_w + extra_x ;
|
||||
}
|
||||
else
|
||||
{
|
||||
fx = 0;
|
||||
}
|
||||
|
||||
if (span_h > 0)
|
||||
{
|
||||
fy = fabs(fy - min_val_h);
|
||||
int flips_y = (int)(fy / span_h);
|
||||
float extra_y = fy - flips_y * span_h;
|
||||
fy = (flips_y & 0x01) ? min_val_h + (span_h - extra_y) : min_val_h + extra_y ;
|
||||
}
|
||||
else
|
||||
{
|
||||
fy = 0;
|
||||
}
|
||||
|
||||
float x_f = floor(fx);
|
||||
float y_f = floor(fy);
|
||||
float x_lerp = fx - x_f;
|
||||
float y_lerp = fy - y_f;
|
||||
int x_index = convert_int(x_f);
|
||||
int y_index = convert_int(y_f);
|
||||
int4 coord_in = (int4)(x_index, y_index, 0, 0);
|
||||
|
||||
float4 top_l, top_r, bottom_l, bottom_r, top, bottom, dst;
|
||||
|
||||
while (coord_in.z < depth){
|
||||
top_l = read_imagef(input0, coord_in);
|
||||
coord_in.y++;
|
||||
bottom_l = read_imagef(input0, coord_in);
|
||||
coord_in.x++;
|
||||
bottom_r = read_imagef(input0, coord_in);
|
||||
coord_in.y--;
|
||||
top_r = read_imagef(input0, coord_in);
|
||||
top_r = top_r - top_l;
|
||||
top = top_l + x_lerp * top_r;
|
||||
bottom_r = bottom_r - bottom_l;
|
||||
bottom = bottom_l + x_lerp * bottom_r;
|
||||
bottom = bottom - top;
|
||||
dst = top + y_lerp * bottom;
|
||||
write_imagef(output, coord_out, dst);
|
||||
coord_in.xz = coord_in.xz + coord_add;
|
||||
coord_out.z++;
|
||||
}
|
||||
}
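Note: the reflect handling above folds an out-of-range sampling coordinate back into the valid interval by counting how many full spans it crosses and mirroring on odd crossings. A scalar C sketch of the same folding, assuming min_val/span describe the start and length of the valid range:

#include <math.h>

/* Scalar sketch of reflect padding for one coordinate: distances are measured
 * from min_val; an odd number of full spans mirrors the coordinate back into
 * [min_val, min_val + span].  span <= 0 falls back to 0, as the kernel does. */
static float reflect_coord(float x, float min_val, float span)
{
    if (span <= 0.0f)
    {
        return 0.0f;
    }
    float d = fabsf(x - min_val);
    int flips = (int)(d / span);
    float extra = d - (float)flips * span;
    return (flips & 1) ? min_val + (span - extra) : min_val + extra;
}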
__kernel void bilinear_grid_sample_reflect_U8_U8toU8(
|
||||
__read_only image2d_array_t input0,
|
||||
__read_only image2d_t input1,
|
||||
__write_only image2d_array_t output,
|
||||
float half_input0_w,
|
||||
float half_input0_h,
|
||||
float add_float_value_w,
|
||||
float add_float_value_h,
|
||||
int depth,
|
||||
float in0_scale,
|
||||
float in0_tail,
|
||||
float in1_scale,
|
||||
float in1_tail,
|
||||
float out_scale,
|
||||
float out_tail,
|
||||
float min_val_w,
|
||||
float span_w,
|
||||
float min_val_h,
|
||||
float span_h
|
||||
)
|
||||
{
|
||||
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
|
||||
int2 coord_in1 = (int2)(get_global_id(0) * 2, get_global_id(1));
|
||||
int2 coord_add = (int2)(-1, 1);
|
||||
|
||||
float fx = convert_float4(read_imageui(input1, coord_in1)).x * in1_scale + in1_tail;
|
||||
coord_in1.x = coord_in1.x + 1;
|
||||
float fy = convert_float4(read_imageui(input1, coord_in1)).x * in1_scale + in1_tail;
|
||||
|
||||
fx = fx * half_input0_w + add_float_value_w;
|
||||
fy = fy * half_input0_h + add_float_value_h;
|
||||
|
||||
if (span_w > 0)
|
||||
{
|
||||
fx = fabs(fx - min_val_w);
|
||||
int flips_x = (int)(fx / span_w);
|
||||
float extra_x = fx - flips_x * span_w;
|
||||
fx = (flips_x & 0x01) ? min_val_w + (span_w - extra_x) : min_val_w + extra_x ;
|
||||
}
|
||||
else
|
||||
{
|
||||
fx = 0;
|
||||
}
|
||||
|
||||
if (span_h > 0)
|
||||
{
|
||||
fy = fabs(fy - min_val_h);
|
||||
int flips_y = (int)(fy / span_h);
|
||||
float extra_y = fy - flips_y * span_h;
|
||||
fy = (flips_y & 0x01) ? min_val_h + (span_h - extra_y) : min_val_h + extra_y ;
|
||||
}
|
||||
else
|
||||
{
|
||||
fy = 0;
|
||||
}
|
||||
|
||||
float x_f = floor(fx);
|
||||
float y_f = floor(fy);
|
||||
float x_lerp = fx - x_f;
|
||||
float y_lerp = fy - y_f;
|
||||
int x_index = convert_int(x_f);
|
||||
int y_index = convert_int(y_f);
|
||||
int4 coord_in = (int4)(x_index, y_index, 0, 0);
|
||||
|
||||
float4 top_l, top_r, bottom_l, bottom_r, top, bottom;
|
||||
uint4 dst;
|
||||
|
||||
while (coord_in.z < depth){
|
||||
top_l = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail;
|
||||
coord_in.y++;
|
||||
bottom_l = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail;
|
||||
coord_in.x++;
|
||||
bottom_r = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail;
|
||||
coord_in.y--;
|
||||
top_r = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail;
|
||||
top_r = top_r - top_l;
|
||||
top = top_l + x_lerp * top_r;
|
||||
bottom_r = bottom_r - bottom_l;
|
||||
bottom = bottom_l + x_lerp * bottom_r;
|
||||
bottom = bottom - top;
|
||||
top = top + y_lerp * bottom;
|
||||
dst = convert_uint4_rte(top * out_scale + out_tail);
|
||||
write_imageui(output, coord_out, dst);
|
||||
coord_in.xz = coord_in.xz + coord_add;
|
||||
coord_out.z++;
|
||||
}
|
||||
|
||||
}
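Note: the U8 variant dequantizes on load and requantizes on store with affine scale/tail pairs. A scalar C sketch of that round trip, assuming the tail terms already fold in the zero points (as the single multiply-add per read suggests):

#include <math.h>
#include <stdint.h>

/* Scalar sketch of the U8 round trip: dequantize with (q * scale + tail),
 * do the interpolation in float, then requantize with round-to-nearest like
 * convert_uint4_rte, clamping to the u8 range. */
static float dequantize_u8(uint8_t q, float in_scale, float in_tail)
{
    return (float)q * in_scale + in_tail;
}

static uint8_t requantize_u8(float interpolated, float out_scale, float out_tail)
{
    float out = interpolated * out_scale + out_tail;
    if (out < 0.0f)   out = 0.0f;
    if (out > 255.0f) out = 255.0f;
    return (uint8_t)lrintf(out);
}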
@ -1,4 +1,4 @@
|
|||
#define VSI_FLOAT32_MIN (1.175494351e-38F)
|
||||
#define VSI_FLOAT32_MIN (-3.40E+38)
|
||||
|
||||
#define MAXPOOL_QINT(in_name, out_name, src_type, dst_type, max_val, read_func, write_func, conv_func) \
|
||||
__kernel void maxpool_##in_name##to##out_name( \
|
||||
|
|
@ -45,7 +45,7 @@ __kernel void maxpool_##in_name##to##out_name( \
|
|||
{ \
|
||||
src0 = read_func(input, coord_in); \
|
||||
coord_in.x += dilation_x; \
|
||||
maxVal = max(src0, maxVal); \
|
||||
maxVal.x = src0.x > maxVal.x ? src0.x : maxVal.x; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
|
|
@ -101,7 +101,7 @@ __kernel void maxpool_F32toF32(
|
|||
{
|
||||
src0 = read_imagef(input, coord_in);
|
||||
coord_in.x += dilation_x;
|
||||
maxVal = max(src0, maxVal);
|
||||
maxVal.x = src0.x > maxVal.x ? src0.x : maxVal.x;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -152,7 +152,7 @@ __kernel void maxpool_U32toF32(
|
|||
{
|
||||
src0 = read_imageui(input, coord_in);
|
||||
coord_in.x += dilation_x;
|
||||
maxVal = max(src0, maxVal);
|
||||
maxVal.x = src0.x > maxVal.x ? src0.x : maxVal.x;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -206,7 +206,7 @@ __kernel void maxpool_F32toU32(
|
|||
{
|
||||
src0 = read_imagef(input, coord_in);
|
||||
coord_in.x += dilation_x;
|
||||
maxVal = max(src0, maxVal);
|
||||
maxVal.x = src0.x > maxVal.x ? src0.x : maxVal.x;
|
||||
}
|
||||
}
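Note: the maxpool hunks replace the vector max() with an explicit compare on the .x lane and redefine VSI_FLOAT32_MIN near -FLT_MAX, so the running maximum starts from a very large negative value rather than the smallest positive normal. A scalar C sketch of the intended accumulation (the macro and helper names here are illustrative only, not part of this commit):

/* Scalar sketch of the max-pooling accumulation: start from a very large
 * negative identity and keep the larger sample. */
#define MAX_IDENTITY_F32 (-3.40E+38f)

static float maxpool_row(const float *src, int kernel_size, int dilation)
{
    float max_val = MAX_IDENTITY_F32;
    for (int i = 0; i < kernel_size; i++)
    {
        float s = src[i * dilation];
        max_val = s > max_val ? s : max_val;   /* mirrors maxVal.x = src0.x > maxVal.x ? ... */
    }
    return max_val;
}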
@ -51,7 +51,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
|
|||
float left_elem = local_data[left_id]; \
|
||||
float right_elem = local_data[right_id]; \
|
||||
\
|
||||
if ((left_elem < right_elem) ^ signo) \
|
||||
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
|
||||
{ \
|
||||
local_data[left_id] = right_elem; \
|
||||
local_data[right_id] = left_elem; \
|
||||
|
|
@ -139,7 +139,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
|
|||
uint left_elem = local_data[left_id]; \
|
||||
uint right_elem = local_data[right_id]; \
|
||||
\
|
||||
if ((left_elem < right_elem) ^ signo) \
|
||||
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
|
||||
{ \
|
||||
local_data[left_id] = right_elem; \
|
||||
local_data[right_id] = left_elem; \
|
||||
|
|
@ -227,7 +227,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
|
|||
int left_elem = local_data[left_id]; \
|
||||
int right_elem = local_data[right_id]; \
|
||||
\
|
||||
if ((left_elem < right_elem) ^ signo) \
|
||||
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
|
||||
{ \
|
||||
local_data[left_id] = right_elem; \
|
||||
local_data[right_id] = left_elem; \
|
||||
|
|
@ -315,7 +315,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
|
|||
float left_elem = local_data[left_id]; \
|
||||
float right_elem = local_data[right_id]; \
|
||||
\
|
||||
if ((left_elem < right_elem) ^ signo) \
|
||||
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
|
||||
{ \
|
||||
local_data[left_id] = right_elem; \
|
||||
local_data[right_id] = left_elem; \
|
||||
|
|
@ -403,7 +403,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
|
|||
float left_elem = local_data[left_id]; \
|
||||
float right_elem = local_data[right_id]; \
|
||||
\
|
||||
if ((left_elem < right_elem) ^ signo) \
|
||||
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
|
||||
{ \
|
||||
local_data[left_id] = right_elem; \
|
||||
local_data[right_id] = left_elem; \
@ -0,0 +1,368 @@
|
|||
|
||||
#define BITONIC_STEP(dtype) \
|
||||
void bitonic_step_##dtype(uint num_stages, int lx, \
|
||||
__local dtype *local_data, __local int *local_indices) \
|
||||
{ \
|
||||
for (uint stage = 0; stage < num_stages + 1; ++stage) \
|
||||
{ \
|
||||
uint signo = (lx >> stage) & 1; \
|
||||
\
|
||||
for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \
|
||||
{ \
|
||||
uint postShift = (stage - passOfStage); \
|
||||
uint pairDistance = 1 << postShift; \
|
||||
\
|
||||
uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \
|
||||
uint right_id = left_id + pairDistance; \
|
||||
\
|
||||
int left_idx = local_indices[left_id]; \
|
||||
int right_idx = local_indices[right_id]; \
|
||||
\
|
||||
dtype left_elem = local_data[left_id]; \
|
||||
dtype right_elem = local_data[right_id]; \
|
||||
\
|
||||
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
|
||||
{ \
|
||||
local_data[left_id] = right_elem; \
|
||||
local_data[right_id] = left_elem; \
|
||||
\
|
||||
local_indices[left_id] = right_idx; \
|
||||
local_indices[right_id] = left_idx; \
|
||||
} \
|
||||
\
|
||||
barrier(CLK_LOCAL_MEM_FENCE); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
BITONIC_STEP(int)
|
||||
BITONIC_STEP(uint)
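Note: BITONIC_STEP and the related macros below break ties on equal keys by comparing the stored indices, which makes the compare-exchange deterministic when duplicates occur. A scalar C sketch of one such exchange:

/* Scalar sketch of one compare-exchange with the index tie-break; signo
 * flips the direction so the network produces the bitonic pattern. */
static void compare_exchange_int(int *data, int *indices,
                                 unsigned left_id, unsigned right_id, unsigned signo)
{
    int left_elem  = data[left_id];
    int right_elem = data[right_id];
    int left_idx   = indices[left_id];
    int right_idx  = indices[right_id];

    if ((unsigned)(left_elem < right_elem ||
                   (left_elem == right_elem && left_idx < right_idx)) ^ signo)
    {
        data[left_id]     = right_elem;
        data[right_id]    = left_elem;
        indices[left_id]  = right_idx;
        indices[right_id] = left_idx;
    }
}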
#define BITONIC_STEP_ASCEND(dtype) \
|
||||
void bitonic_step_ascend_##dtype(uint num_stages, int lx, \
|
||||
__local dtype *p_share_k, __local int *p_share_v) \
|
||||
{ \
|
||||
for (uint stage = 0; stage < num_stages + 1; ++stage) \
|
||||
{ \
|
||||
uint signo = (lx >> stage) & 1; \
|
||||
\
|
||||
for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \
|
||||
{ \
|
||||
uint postShift = (stage - passOfStage); \
|
||||
uint pairDistance = 1 << postShift; \
|
||||
\
|
||||
uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \
|
||||
uint right_id = left_id + pairDistance; \
|
||||
\
|
||||
int left_idx = p_share_v[left_id]; \
|
||||
int right_idx = p_share_v[right_id]; \
|
||||
\
|
||||
dtype left_elem = p_share_k[left_id]; \
|
||||
dtype right_elem = p_share_k[right_id]; \
|
||||
\
|
||||
if ((left_elem > right_elem || (left_elem == right_elem && left_idx > right_idx)) ^ signo) \
|
||||
{ \
|
||||
p_share_k[left_id] = right_elem; \
|
||||
p_share_k[right_id] = left_elem; \
|
||||
\
|
||||
p_share_v[left_id] = right_idx; \
|
||||
p_share_v[right_id] = left_idx; \
|
||||
} \
|
||||
\
|
||||
barrier(CLK_LOCAL_MEM_FENCE); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
BITONIC_STEP_ASCEND(int)
|
||||
BITONIC_STEP_ASCEND(uint)
|
||||
|
||||
#define BITONIC_MERGE(dtype) \
|
||||
void bitonic_merge_##dtype(uint num_stages, int lx, \
|
||||
__local dtype *local_data, __local int *local_indices) \
|
||||
{ \
|
||||
uint stage = num_stages; \
|
||||
uint signo = (lx >> stage) & 1; \
|
||||
\
|
||||
for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \
|
||||
{ \
|
||||
uint postShift = (stage - passOfStage); \
|
||||
uint pairDistance = 1 << postShift; \
|
||||
\
|
||||
uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \
|
||||
uint right_id = left_id + pairDistance; \
|
||||
\
|
||||
int left_idx = local_indices[left_id]; \
|
||||
int right_idx = local_indices[right_id]; \
|
||||
\
|
||||
dtype left_elem = local_data[left_id]; \
|
||||
dtype right_elem = local_data[right_id]; \
|
||||
\
|
||||
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
|
||||
{ \
|
||||
local_data[left_id] = right_elem; \
|
||||
local_data[right_id] = left_elem; \
|
||||
\
|
||||
local_indices[left_id] = right_idx; \
|
||||
local_indices[right_id] = left_idx; \
|
||||
} \
|
||||
\
|
||||
barrier(CLK_LOCAL_MEM_FENCE); \
|
||||
} \
|
||||
}
|
||||
BITONIC_MERGE(int)
|
||||
BITONIC_MERGE(uint)
|
||||
|
||||
#define BLOCK_SIZE (512)
|
||||
|
||||
__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_I32toI32_I32
|
||||
(
|
||||
__read_only image2d_t input,
|
||||
__write_only image2d_t output,
|
||||
__write_only image2d_t indices,
|
||||
float input_scale,
|
||||
float input_tail,
|
||||
float output_scale,
|
||||
float output_tail,
|
||||
int _num_stages,
|
||||
int width
|
||||
)
|
||||
{
|
||||
uint lx = get_local_id(0);
|
||||
const int init_k = -2147483647;
|
||||
const int init_v = -2147483647;
|
||||
const int num_stages = 9;
|
||||
const int threads_per_block = BLOCK_SIZE;
|
||||
const int index_minus_1 = threads_per_block * 2 - 1;
|
||||
uint offset = 0;
|
||||
uint lx1 = lx + threads_per_block;
|
||||
|
||||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
|
||||
|
||||
__local int local_data[1536];
|
||||
__local int local_indices[1536];
|
||||
|
||||
int left = read_imagei(input, coord.xy).x;
|
||||
coord.z += threads_per_block;
|
||||
int right = read_imagei(input, coord.zy).x;
|
||||
|
||||
local_data[lx] = left;
|
||||
local_indices[lx] = coord.x;
|
||||
local_data[lx1] = right;
|
||||
local_indices[lx1] = coord.z;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
bitonic_step_int(num_stages, lx, local_data, local_indices);
|
||||
|
||||
int min_data = local_data[511];
|
||||
|
||||
int *p_share_k = local_data + threads_per_block;
|
||||
int *p_share_v = local_indices + threads_per_block;
|
||||
|
||||
int limit = (width >> 10) << 10;
|
||||
p_share_k[lx] = init_k;
|
||||
p_share_v[lx] = init_v;
|
||||
|
||||
p_share_k[lx1] = init_k;
|
||||
p_share_v[lx1] = init_v;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)
|
||||
{
|
||||
int2 data;
|
||||
coord.z = coord.x + threads_per_block;
|
||||
data.x = read_imagei(input, coord.xy).x;
|
||||
data.y = read_imagei(input, coord.zy).x;
|
||||
|
||||
p_share_k[lx] = data.x;
|
||||
p_share_v[lx] = coord.x;
|
||||
|
||||
p_share_k[lx1] = data.y;
|
||||
p_share_v[lx1] = coord.z;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);
|
||||
|
||||
if (p_share_k[index_minus_1] < min_data)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
p_share_k[lx] = p_share_k[lx1];
|
||||
p_share_v[lx] = p_share_v[lx1];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
bitonic_merge_int(num_stages, lx, local_data, local_indices);
|
||||
|
||||
min_data = local_data[511];
|
||||
p_share_k[lx] = init_k;
|
||||
p_share_v[lx] = init_v;
|
||||
p_share_k[lx1] = init_k;
|
||||
p_share_v[lx1] = init_v;
|
||||
}
|
||||
|
||||
if (width > limit)
|
||||
{
|
||||
if (coord.x < width)
|
||||
{
|
||||
int2 data;
|
||||
data.x = read_imagei(input, coord.xy).x;
|
||||
coord.z = coord.x + threads_per_block;
|
||||
data.y = read_imagei(input, coord.zy).x;
|
||||
|
||||
p_share_k[lx] = data.x;
|
||||
p_share_v[lx] = coord.x;
|
||||
|
||||
p_share_k[lx1] = coord.z < width ? data.y : init_k;
|
||||
p_share_v[lx1] = coord.z < width ? coord.z : init_v;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);
|
||||
|
||||
if (p_share_k[index_minus_1] >= min_data)
|
||||
{
|
||||
p_share_k[lx] = p_share_k[lx1];
|
||||
p_share_v[lx] = p_share_v[lx1];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
bitonic_merge_int(num_stages, lx, local_data, local_indices);
|
||||
}
|
||||
}
|
||||
|
||||
int4 dst;
|
||||
dst.x = local_data[lx];
|
||||
|
||||
coord.x = lx;
|
||||
write_imagei(output, coord.xy, dst.xxxx);
|
||||
|
||||
int4 index;
|
||||
index.x = local_indices[lx];
|
||||
|
||||
write_imagei(indices, coord.xy, index.xxxx);
|
||||
}
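Note: topk_stage_I32toI32_I32 keeps the running top-512 in the first half of local_data: the first 1024 values are bitonic-sorted, each later chunk of 1024 is sorted and skipped outright when its best candidate is below the current 512th value (min_data), otherwise its top half is merged back in with bitonic_merge_int. A host-side C sketch of that selection idea, assuming the input length is a multiple of the chunk size and using a hypothetical sort_desc() helper:

#include <stddef.h>
#include <string.h>

#define TOPK_K     512            /* entries the kernel keeps, one per work-item */
#define TOPK_CHUNK (2 * TOPK_K)   /* values consumed per iteration */

/* Hypothetical descending sort helper (e.g. qsort with a reversed comparator). */
extern void sort_desc(int *v, size_t n);

/* Host-side sketch of the chunked selection; assumes n is a multiple of
 * TOPK_CHUNK and n >= TOPK_CHUNK.  best[0..TOPK_K-1] ends up holding the
 * largest TOPK_K values in descending order. */
static void topk_chunked(const int *src, size_t n, int best[TOPK_CHUNK])
{
    int cand[TOPK_CHUNK];
    memcpy(best, src, TOPK_CHUNK * sizeof(int));
    sort_desc(best, TOPK_CHUNK);
    for (size_t off = TOPK_CHUNK; off + TOPK_CHUNK <= n; off += TOPK_CHUNK)
    {
        memcpy(cand, src + off, TOPK_CHUNK * sizeof(int));
        sort_desc(cand, TOPK_CHUNK);
        if (cand[0] < best[TOPK_K - 1])
        {
            continue;                                      /* chunk cannot improve the top-K */
        }
        memcpy(best + TOPK_K, cand, TOPK_K * sizeof(int)); /* keep only its K best candidates */
        sort_desc(best, TOPK_CHUNK);                       /* re-rank; top-K stays in front */
    }
}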
|
||||
|
||||
__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_U32toU32_I32
|
||||
(
|
||||
__read_only image2d_t input,
|
||||
__write_only image2d_t output,
|
||||
__write_only image2d_t indices,
|
||||
float input_scale,
|
||||
float input_tail,
|
||||
float output_scale,
|
||||
float output_tail,
|
||||
int _num_stages,
|
||||
int width
|
||||
)
|
||||
{
|
||||
uint lx = get_local_id(0);
|
||||
const uint init_k = 0;
|
||||
const int init_v = -2147483647;
|
||||
const int num_stages = 9;
|
||||
const int threads_per_block = BLOCK_SIZE;
|
||||
const int index_minus_1 = threads_per_block * 2 - 1;
|
||||
uint offset = 0;
|
||||
uint lx1 = lx + threads_per_block;
|
||||
|
||||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
|
||||
|
||||
__local uint local_data[1536];
|
||||
__local int local_indices[1536];
|
||||
|
||||
uint left = read_imageui(input, coord.xy).x;
|
||||
coord.z += threads_per_block;
|
||||
uint right = read_imageui(input, coord.zy).x;
|
||||
|
||||
local_data[lx] = left;
|
||||
local_indices[lx] = coord.x;
|
||||
local_data[lx1] = right;
|
||||
local_indices[lx1] = coord.z;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
bitonic_step_uint(num_stages, lx, local_data, local_indices);
|
||||
|
||||
uint min_data = local_data[511];
|
||||
|
||||
uint *p_share_k = local_data + threads_per_block;
|
||||
int *p_share_v = local_indices + threads_per_block;
|
||||
|
||||
int limit = (width >> 10) << 10;
|
||||
p_share_k[lx] = init_k;
|
||||
p_share_v[lx] = init_v;
|
||||
|
||||
p_share_k[lx1] = init_k;
|
||||
p_share_v[lx1] = init_v;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)
|
||||
{
|
||||
uint2 data;
|
||||
coord.z = coord.x + threads_per_block;
|
||||
data.x = read_imageui(input, coord.xy).x;
|
||||
data.y = read_imageui(input, coord.zy).x;
|
||||
|
||||
p_share_k[lx] = data.x;
|
||||
p_share_v[lx] = coord.x;
|
||||
|
||||
p_share_k[lx1] = data.y;
|
||||
p_share_v[lx1] = coord.z;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);
|
||||
|
||||
if (p_share_k[index_minus_1] < min_data)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
p_share_k[lx] = p_share_k[lx1];
|
||||
p_share_v[lx] = p_share_v[lx1];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
bitonic_merge_uint(num_stages, lx, local_data, local_indices);
|
||||
|
||||
min_data = local_data[511];
|
||||
p_share_k[lx] = init_k;
|
||||
p_share_v[lx] = init_v;
|
||||
p_share_k[lx1] = init_k;
|
||||
p_share_v[lx1] = init_v;
|
||||
}
|
||||
|
||||
if (width > limit)
|
||||
{
|
||||
if (coord.x < width)
|
||||
{
|
||||
uint2 data;
|
||||
data.x = read_imageui(input, coord.xy).x;
|
||||
coord.z = coord.x + threads_per_block;
|
||||
data.y = read_imageui(input, coord.zy).x;
|
||||
|
||||
p_share_k[lx] = data.x;
|
||||
p_share_v[lx] = coord.x;
|
||||
|
||||
p_share_k[lx1] = coord.z < width ? data.y : init_k;
|
||||
p_share_v[lx1] = coord.z < width ? coord.z : init_v;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);
|
||||
|
||||
if (p_share_k[index_minus_1] >= min_data)
|
||||
{
|
||||
p_share_k[lx] = p_share_k[lx1];
|
||||
p_share_v[lx] = p_share_v[lx1];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
bitonic_merge_uint(num_stages, lx, local_data, local_indices);
|
||||
}
|
||||
}
|
||||
|
||||
uint4 dst;
|
||||
dst.x = local_data[lx];
|
||||
|
||||
coord.x = lx;
|
||||
write_imageui(output, coord.xy, dst.xxxx);
|
||||
|
||||
int4 index;
|
||||
index.x = local_indices[lx];
|
||||
|
||||
write_imagei(indices, coord.xy, index.xxxx);
|
||||
}
|
||||
|
|
@ -28,12 +28,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
}
|
||||
|
||||
__local int sorted[1];
|
||||
int width_minus_one = width - 1;
|
||||
int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X;
|
||||
num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1);
|
||||
|
||||
int x_start = lid * num_pixels_per_thread;
|
||||
int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one);
|
||||
|
||||
sorted[0] = 0;
|
||||
|
||||
|
|
@ -44,20 +38,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
*sorted = 0;
|
||||
}
|
||||
int swapped = 0;
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// odd-even
|
||||
coord.x = x_start;
|
||||
coord.z = x_start + 1;
|
||||
for (; coord.x < x_end; )
|
||||
coord.x = lid * 2;
|
||||
coord.z = lid * 2 + 1;
|
||||
for (; coord.z < width; )
|
||||
{
|
||||
float4 left = read_imagef(input_t, coord.xy);
|
||||
float4 right = read_imagef(input_t, coord.zy);
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
|
||||
if (left.x < right.x)
|
||||
if ( (left.x < right.x) ||
|
||||
(left.x == right.x && l_index.x < r_index.x) )
|
||||
{
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
swapped = 1;
|
||||
|
||||
write_imagef(input_t, coord.xy, right);
|
||||
|
|
@ -67,21 +62,23 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
write_imagei(indices_t, coord.zy, l_index);
|
||||
}
|
||||
|
||||
coord.xz = coord.xz + 2;
|
||||
coord.xz += 2 * LOCAL_SIZE_X;
|
||||
}
|
||||
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
// even-odd
|
||||
coord.x = x_start + 1;
|
||||
coord.z = x_start + 2;
|
||||
for (; coord.x < x_end; )
|
||||
coord.x = lid * 2 + 1;
|
||||
coord.z = lid * 2 + 2;
|
||||
for (; coord.z < width; )
|
||||
{
|
||||
float4 left = read_imagef(input_t, coord.xy);
|
||||
float4 right = read_imagef(input_t, coord.zy);
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
|
||||
if (left.x < right.x)
|
||||
if ( (left.x < right.x) ||
|
||||
(left.x == right.x && l_index.x < r_index.x) )
|
||||
{
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
swapped = 1;
|
||||
|
||||
write_imagef(input_t, coord.xy, right);
|
||||
|
|
@ -91,11 +88,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
write_imagei(indices_t, coord.zy, l_index);
|
||||
}
|
||||
|
||||
coord.xz = coord.xz + 2;
|
||||
coord.xz += 2 * LOCAL_SIZE_X;
|
||||
}
|
||||
|
||||
atomic_add(sorted, swapped);
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (*sorted == 0)
|
||||
break;
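Note: the odd-even kernels switch from per-thread contiguous ranges to strided pairs (lid*2 stepping by 2*LOCAL_SIZE_X), use local barriers between phases, and keep iterating until a whole pass makes no exchange. A single-threaded C sketch of the underlying odd-even transposition sort in descending order, with the same index tie-break:

/* Single-threaded sketch of odd-even transposition sort, descending, with the
 * index tie-break; the kernels split each phase across LOCAL_SIZE_X threads in
 * a strided fashion and stop once a whole pass performs no swap. */
static void odd_even_sort_desc(float *val, int *idx, int width)
{
    int swapped = 1;
    while (swapped)
    {
        swapped = 0;
        for (int phase = 0; phase < 2; phase++)          /* 0: even pairs, 1: odd pairs */
        {
            for (int x = phase; x + 1 < width; x += 2)
            {
                if (val[x] < val[x + 1] ||
                    (val[x] == val[x + 1] && idx[x] < idx[x + 1]))
                {
                    float tv = val[x]; val[x] = val[x + 1]; val[x + 1] = tv;
                    int   ti = idx[x]; idx[x] = idx[x + 1]; idx[x + 1] = ti;
                    swapped = 1;
                }
            }
        }
    }
}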
@ -141,13 +138,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
}
|
||||
|
||||
__local int sorted[1];
|
||||
int width_minus_one = width - 1;
|
||||
int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X;
|
||||
num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1);
|
||||
|
||||
int x_start = lid * num_pixels_per_thread;
|
||||
int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one);
|
||||
|
||||
sorted[0] = 0;
|
||||
|
||||
while (1)
|
||||
|
|
@ -157,20 +147,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
*sorted = 0;
|
||||
}
|
||||
int swapped = 0;
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// odd-even
|
||||
coord.x = x_start;
|
||||
coord.z = x_start + 1;
|
||||
for (; coord.x < x_end; )
|
||||
coord.x = lid * 2;
|
||||
coord.z = lid * 2 + 1;
|
||||
for (; coord.z < width; )
|
||||
{
|
||||
uint4 left = read_imageui(input_t, coord.xy);
|
||||
uint4 right = read_imageui(input_t, coord.zy);
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
|
||||
if (left.x < right.x)
|
||||
if ( (left.x < right.x) ||
|
||||
(left.x == right.x && l_index.x < r_index.x) )
|
||||
{
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
swapped = 1;
|
||||
|
||||
write_imageui(input_t, coord.xy, right);
|
||||
|
|
@ -180,21 +171,23 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
write_imagei(indices_t, coord.zy, l_index);
|
||||
}
|
||||
|
||||
coord.xz = coord.xz + 2;
|
||||
coord.xz += 2 * LOCAL_SIZE_X;
|
||||
}
|
||||
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
// even-odd
|
||||
coord.x = x_start + 1;
|
||||
coord.z = x_start + 2;
|
||||
for (; coord.x < x_end; )
|
||||
coord.x = lid * 2 + 1;
|
||||
coord.z = lid * 2 + 2;
|
||||
for (; coord.z < width; )
|
||||
{
|
||||
uint4 left = read_imageui(input_t, coord.xy);
|
||||
uint4 right = read_imageui(input_t, coord.zy);
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
|
||||
if (left.x < right.x)
|
||||
if ( (left.x < right.x) ||
|
||||
(left.x == right.x && l_index.x < r_index.x) )
|
||||
{
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
swapped = 1;
|
||||
|
||||
write_imageui(input_t, coord.xy, right);
|
||||
|
|
@ -204,11 +197,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
write_imagei(indices_t, coord.zy, l_index);
|
||||
}
|
||||
|
||||
coord.xz = coord.xz + 2;
|
||||
coord.xz += 2 * LOCAL_SIZE_X;
|
||||
}
|
||||
|
||||
atomic_add(sorted, swapped);
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (*sorted == 0)
|
||||
break;
|
||||
|
|
@ -254,13 +247,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
}
|
||||
|
||||
__local int sorted[1];
|
||||
int width_minus_one = width - 1;
|
||||
int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X;
|
||||
num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1);
|
||||
|
||||
int x_start = lid * num_pixels_per_thread;
|
||||
int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one);
|
||||
|
||||
sorted[0] = 0;
|
||||
|
||||
while (1)
|
||||
|
|
@ -270,20 +256,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
*sorted = 0;
|
||||
}
|
||||
int swapped = 0;
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// odd-even
|
||||
coord.x = x_start;
|
||||
coord.z = x_start + 1;
|
||||
for (; coord.x < x_end; )
|
||||
coord.x = lid * 2;
|
||||
coord.z = lid * 2 + 1;
|
||||
for (; coord.z < width; )
|
||||
{
|
||||
int4 left = read_imagei(input_t, coord.xy);
|
||||
int4 right = read_imagei(input_t, coord.zy);
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
|
||||
if (left.x < right.x)
|
||||
if ( (left.x < right.x) ||
|
||||
(left.x == right.x && l_index.x < r_index.x) )
|
||||
{
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
swapped = 1;
|
||||
|
||||
write_imagei(input_t, coord.xy, right);
|
||||
|
|
@ -293,21 +280,23 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
write_imagei(indices_t, coord.zy, l_index);
|
||||
}
|
||||
|
||||
coord.xz = coord.xz + 2;
|
||||
coord.xz += 2 * LOCAL_SIZE_X;
|
||||
}
|
||||
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
// even-odd
|
||||
coord.x = x_start + 1;
|
||||
coord.z = x_start + 2;
|
||||
for (; coord.x < x_end; )
|
||||
coord.x = lid * 2 + 1;
|
||||
coord.z = lid * 2 + 2;
|
||||
for (; coord.z < width; )
|
||||
{
|
||||
int4 left = read_imagei(input_t, coord.xy);
|
||||
int4 right = read_imagei(input_t, coord.zy);
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
|
||||
if (left.x < right.x)
|
||||
if ( (left.x < right.x) ||
|
||||
(left.x == right.x && l_index.x < r_index.x) )
|
||||
{
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
swapped = 1;
|
||||
|
||||
write_imagei(input_t, coord.xy, right);
|
||||
|
|
@ -317,11 +306,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
write_imagei(indices_t, coord.zy, l_index);
|
||||
}
|
||||
|
||||
coord.xz = coord.xz + 2;
|
||||
coord.xz += 2 * LOCAL_SIZE_X;
|
||||
}
|
||||
|
||||
atomic_add(sorted, swapped);
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (*sorted == 0)
|
||||
break;
|
||||
|
|
|
|||
|
|
@ -28,12 +28,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
}
|
||||
|
||||
__local int sorted[1];
|
||||
int width_minus_one = width - 1;
|
||||
int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X;
|
||||
num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1);
|
||||
|
||||
int x_start = lid * num_pixels_per_thread;
|
||||
int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one);
|
||||
|
||||
sorted[0] = 0;
|
||||
|
||||
|
|
@ -44,20 +38,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
*sorted = 0;
|
||||
}
|
||||
int swapped = 0;
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// odd-even
|
||||
coord.x = x_start;
|
||||
coord.z = x_start + 1;
|
||||
for (; coord.x < x_end; )
|
||||
coord.x = lid * 2;
|
||||
coord.z = lid * 2 + 1;
|
||||
for (; coord.z < width; )
|
||||
{
|
||||
float4 left = read_imagef(input_t, coord.xy);
|
||||
float4 right = read_imagef(input_t, coord.zy);
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
|
||||
if (left.x < right.x)
|
||||
if ( (left.x < right.x) ||
|
||||
(left.x == right.x && l_index.x < r_index.x) )
|
||||
{
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
swapped = 1;
|
||||
|
||||
write_imagef(input_t, coord.xy, right);
|
||||
|
|
@ -67,21 +62,23 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
write_imagei(indices_t, coord.zy, l_index);
|
||||
}
|
||||
|
||||
coord.xz = coord.xz + 2;
|
||||
coord.xz += 2 * LOCAL_SIZE_X;
|
||||
}
|
||||
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
// even-odd
|
||||
coord.x = x_start + 1;
|
||||
coord.z = x_start + 2;
|
||||
for (; coord.x < x_end; )
|
||||
coord.x = lid * 2 + 1;
|
||||
coord.z = lid * 2 + 2;
|
||||
for (; coord.z < width; )
|
||||
{
|
||||
float4 left = read_imagef(input_t, coord.xy);
|
||||
float4 right = read_imagef(input_t, coord.zy);
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
|
||||
if (left.x < right.x)
|
||||
if ( (left.x < right.x) ||
|
||||
(left.x == right.x && l_index.x < r_index.x) )
|
||||
{
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
swapped = 1;
|
||||
|
||||
write_imagef(input_t, coord.xy, right);
|
||||
|
|
@ -91,11 +88,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
write_imagei(indices_t, coord.zy, l_index);
|
||||
}
|
||||
|
||||
coord.xz = coord.xz + 2;
|
||||
coord.xz += 2 * LOCAL_SIZE_X;
|
||||
}
|
||||
|
||||
atomic_add(sorted, swapped);
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (*sorted == 0)
|
||||
break;
|
||||
|
|
@ -143,13 +140,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
}
|
||||
|
||||
__local int sorted[1];
|
||||
int width_minus_one = width - 1;
|
||||
int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X;
|
||||
num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1);
|
||||
|
||||
int x_start = lid * num_pixels_per_thread;
|
||||
int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one);
|
||||
|
||||
sorted[0] = 0;
|
||||
|
||||
while (1)
|
||||
|
|
@ -159,20 +149,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
*sorted = 0;
|
||||
}
|
||||
int swapped = 0;
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// odd-even
|
||||
coord.x = x_start;
|
||||
coord.z = x_start + 1;
|
||||
for (; coord.x < x_end; )
|
||||
coord.x = lid * 2;
|
||||
coord.z = lid * 2 + 1;
|
||||
for (; coord.z < width; )
|
||||
{
|
||||
float4 left = read_imagef(input_t, coord.xy);
|
||||
float4 right = read_imagef(input_t, coord.zy);
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
|
||||
if (left.x < right.x)
|
||||
if ( (left.x < right.x) ||
|
||||
(left.x == right.x && l_index.x < r_index.x) )
|
||||
{
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
swapped = 1;
|
||||
|
||||
write_imagef(input_t, coord.xy, right);
|
||||
|
|
@ -182,18 +173,22 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
write_imagei(indices_t, coord.zy, l_index);
|
||||
}
|
||||
|
||||
coord.xz = coord.xz + 2;
|
||||
coord.xz += 2 * LOCAL_SIZE_X;
|
||||
}
|
||||
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
// even-odd
|
||||
coord.x = x_start + 1;
|
||||
coord.z = x_start + 2;
|
||||
for (; coord.x < x_end; )
|
||||
coord.x = lid * 2 + 1;
|
||||
coord.z = lid * 2 + 2;
|
||||
for (; coord.z < width; )
|
||||
{
|
||||
float4 left = read_imagef(input_t, coord.xy);
|
||||
float4 right = read_imagef(input_t, coord.zy);
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
|
||||
if (left.x < right.x)
|
||||
if ( (left.x < right.x) ||
|
||||
(left.x == right.x && l_index.x < r_index.x) )
|
||||
{
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
|
|
@ -206,11 +201,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
write_imagei(indices_t, coord.zy, l_index);
|
||||
}
|
||||
|
||||
coord.xz = coord.xz + 2;
|
||||
coord.xz += 2 * LOCAL_SIZE_X;
|
||||
}
|
||||
|
||||
atomic_add(sorted, swapped);
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (*sorted == 0)
|
||||
break;
@ -0,0 +1,171 @@
|
|||
#include "cl_viv_vx_ext.h"
|
||||
|
||||
_viv_uniform float2 half_input0_wh;
|
||||
_viv_uniform float2 add_float_value;
|
||||
_viv_uniform int depth;
|
||||
_viv_uniform VXC_512Bits uniConvBF16toF32_even_2x8;
|
||||
_viv_uniform VXC_512Bits uniConvBF16toF32_odd_2x8;
|
||||
|
||||
_viv_uniform VXC_512Bits uniBF16toFp32_part0_2x8;
|
||||
_viv_uniform VXC_512Bits uniBF16toFp32_part1_2x8;
|
||||
_viv_uniform float4 span_wh;
|
||||
_viv_uniform float4 min_val_wh;
|
||||
|
||||
#define GRID_SAMPLE_BF16_PROCESS() \
|
||||
fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \
|
||||
fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \
|
||||
fxy0 = fabs(fxy0 - min_val_wh); \
|
||||
fxy1 = fabs(fxy1 - min_val_wh); \
|
||||
float4 flips_xy0 = floor(fxy0 / span_wh); \
|
||||
float4 flips_xy1 = floor(fxy1 / span_wh); \
|
||||
float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \
|
||||
float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \
|
||||
int4 flips_int_xy0 = convert_int4(flips_xy0); \
|
||||
int4 flips_int_xy1 = convert_int4(flips_xy1); \
|
||||
fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \
|
||||
fxy1 = ((flips_int_xy1 % 2) == 0) ? min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \
|
||||
float4 in_x = (float4)(fxy0.xz, fxy1.xz); \
|
||||
float4 x_f = floor(in_x); \
|
||||
float4 x_lerp = in_x - x_f; \
|
||||
int4 x_idx = convert_int4(x_f); \
|
||||
float4 in_y = (float4)(fxy0.yw, fxy1.yw); \
|
||||
float4 y_f = floor(in_y); \
|
||||
float4 y_lerp = in_y - y_f; \
|
||||
int4 y_idx = convert_int4(y_f); \
|
||||
int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \
|
||||
int8 input_desc; \
|
||||
_viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \
|
||||
int baseAddr = input_desc.s0; \
|
||||
_viv_asm(MOV, coord_in.w, baseAddr); \
|
||||
vxc_short8 top; \
|
||||
vxc_short8 bottom; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.y; \
|
||||
coord_in.y = y_idx.y; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.z; \
|
||||
coord_in.y = y_idx.z; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.w; \
|
||||
coord_in.y = y_idx.w; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
float4 left4; \
|
||||
float4 right4; \
|
||||
float4 top4; \
|
||||
float4 bottom4; \
|
||||
int8 output_desc; \
|
||||
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
|
||||
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
|
||||
_viv_asm(MOV, coord_out.w, baseAddr); \
|
||||
int loop = depth - 1; \
|
||||
vxc_ushort8 tmp, dst; \
|
||||
while (coord_in.z < loop) \
|
||||
{ \
|
||||
VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \
|
||||
_viv_asm(COPY, right4, src, 16); \
|
||||
VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \
|
||||
_viv_asm(COPY, left4, src, 16); \
|
||||
right4 -= left4; \
|
||||
top4 = right4 * x_lerp + left4; \
|
||||
VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \
|
||||
_viv_asm(COPY, right4, src, 16); \
|
||||
VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \
|
||||
_viv_asm(COPY, left4, src, 16); \
|
||||
right4 -= left4; \
|
||||
bottom4 = right4 * x_lerp + left4; \
|
||||
bottom4 -= top4; \
|
||||
float4 dst4 = bottom4 * y_lerp + top4; \
|
||||
_viv_asm(COPY, tmp, dst4, 16); \
|
||||
dst.s0123 = tmp.s1357; \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
|
||||
coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \
|
||||
coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \
|
||||
coord_in.x = x_idx.x; \
|
||||
coord_in.y = y_idx.x; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.y; \
|
||||
coord_in.y = y_idx.y; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.z; \
|
||||
coord_in.y = y_idx.z; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.w; \
|
||||
coord_in.y = y_idx.w; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
} \
|
||||
VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \
|
||||
_viv_asm(COPY, right4, src, 16); \
|
||||
VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \
|
||||
_viv_asm(COPY, left4, src, 16); \
|
||||
right4 -= left4; \
|
||||
top4 = right4 * x_lerp + left4; \
|
||||
VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \
|
||||
_viv_asm(COPY, right4, src, 16); \
|
||||
VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \
|
||||
_viv_asm(COPY, left4, src, 16); \
|
||||
right4 -= left4; \
|
||||
bottom4 = right4 * x_lerp + left4; \
|
||||
bottom4 -= top4; \
|
||||
float4 dst4 = bottom4 * y_lerp + top4; \
|
||||
_viv_asm(COPY, tmp, dst4, 16); \
|
||||
dst.s0123 = tmp.s1357; \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
|
||||
|
||||
|
||||
|
||||
|
||||
__kernel void bilinear_grid_sample_reflect_BF16_BF16toBF16(
|
||||
__read_only image2d_array_t input0,
|
||||
__read_only image2d_t input1,
|
||||
__write_only image2d_array_t output,
|
||||
int align_corners)
|
||||
{
|
||||
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
|
||||
int4 coord_in1 = coord_out.xyxy;
|
||||
|
||||
coord_in1.xz = coord_in1.xz * 2;
|
||||
|
||||
vxc_short8 read_val;
|
||||
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
|
||||
VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
|
||||
float4 fxy0;
|
||||
float4 fxy1;
|
||||
|
||||
vxc_short8 src;
|
||||
VXC_DP2x8(src, read_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniBF16toFp32_part0_2x8);
|
||||
_viv_asm(COPY, fxy0, src, 16);
|
||||
VXC_DP2x8(src, read_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniBF16toFp32_part1_2x8);
|
||||
_viv_asm(COPY, fxy1, src, 16);
|
||||
|
||||
|
||||
|
||||
GRID_SAMPLE_BF16_PROCESS();
|
||||
|
||||
}
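Note: the BF16 path does the arithmetic in float and writes back by keeping only the high 16 bits of each float (the dst.s0123 = tmp.s1357 shuffle), i.e. a truncating float32 to bfloat16 conversion. A scalar C sketch of that reinterpretation, assuming the little-endian lane layout the shuffle implies:

#include <stdint.h>
#include <string.h>

/* Scalar sketch: bfloat16 is the top 16 bits of an IEEE-754 float32. */
static uint16_t f32_to_bf16_trunc(float f)
{
    uint32_t bits;
    memcpy(&bits, &f, sizeof(bits));
    return (uint16_t)(bits >> 16);      /* drop the low 16 mantissa bits (truncation) */
}

static float bf16_to_f32(uint16_t h)
{
    uint32_t bits = (uint32_t)h << 16;  /* low mantissa bits become zero */
    float f;
    memcpy(&f, &bits, sizeof(f));
    return f;
}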
@ -0,0 +1,217 @@
|
|||
#include "cl_viv_vx_ext.h"
|
||||
|
||||
_viv_uniform float2 half_input0_wh;
|
||||
_viv_uniform float2 add_float_value;
|
||||
_viv_uniform int depth;
|
||||
_viv_uniform VXC_512Bits uniEvenBintoFp32_4x4;
|
||||
_viv_uniform VXC_512Bits uniOddSubEvenBin_4x4;
|
||||
_viv_uniform VXC_512Bits uniExtactHalf8_2x8;
|
||||
_viv_uniform float4 span_wh;
|
||||
_viv_uniform float4 min_val_wh;
|
||||
|
||||
#define GRID_SAMPLE_F16_PROCESS() \
|
||||
fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \
|
||||
fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \
|
||||
fxy0 = fabs(fxy0 - min_val_wh); \
|
||||
fxy1 = fabs(fxy1 - min_val_wh); \
|
||||
float4 flips_xy0 = floor(fxy0 / span_wh); \
|
||||
float4 flips_xy1 = floor(fxy1 / span_wh); \
|
||||
float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \
|
||||
float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \
|
||||
int4 flips_int_xy0 = convert_int4(flips_xy0); \
|
||||
int4 flips_int_xy1 = convert_int4(flips_xy1); \
|
||||
fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \
|
||||
fxy1 = ((flips_int_xy1 % 2) == 0) ? min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \
|
||||
float4 in_x = (float4)(fxy0.xz, fxy1.xz); \
|
||||
float4 x_f = floor(in_x); \
|
||||
float4 x_lerp = in_x - x_f; \
|
||||
int4 x_idx = convert_int4(x_f); \
|
||||
float4 in_y = (float4)(fxy0.yw, fxy1.yw); \
|
||||
float4 y_f = floor(in_y); \
|
||||
float4 y_lerp = in_y - y_f; \
|
||||
int4 y_idx = convert_int4(y_f); \
|
||||
int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \
|
||||
int8 input_desc; \
|
||||
_viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \
|
||||
int baseAddr = input_desc.s0; \
|
||||
_viv_asm(MOV, coord_in.w, baseAddr); \
|
||||
vxc_short8 t0; \
|
||||
vxc_short8 b0; \
|
||||
vxc_half8 top; \
|
||||
vxc_half8 bottom; \
|
||||
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.y; \
|
||||
coord_in.y = y_idx.y; \
|
||||
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.z; \
|
||||
coord_in.y = y_idx.z; \
|
||||
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.w; \
|
||||
coord_in.y = y_idx.w; \
|
||||
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
_viv_asm(COPY, top, t0, 16); \
|
||||
_viv_asm(COPY, bottom, b0, 16); \
|
||||
float4 left4; \
|
||||
float4 right4; \
|
||||
float4 top4; \
|
||||
float4 bottom4; \
|
||||
int8 output_desc; \
|
||||
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
|
||||
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
|
||||
_viv_asm(MOV, coord_out.w, baseAddr); \
|
||||
int loop = depth - 1; \
|
||||
while (coord_in.z < loop) \
|
||||
{ \
|
||||
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \
|
||||
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \
|
||||
top4 = right4 * x_lerp + left4; \
|
||||
VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \
|
||||
VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \
|
||||
bottom4 = right4 * x_lerp + left4; \
|
||||
bottom4 -= top4; \
|
||||
float4 dst4 = bottom4 * y_lerp + top4; \
|
||||
half4 tmp; \
|
||||
_viv_asm(CONV, tmp, dst4); \
|
||||
VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); \
|
||||
vxc_short4 result; \
|
||||
_viv_asm(COPY, result, top, 8); \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
|
||||
coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \
|
||||
coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \
|
||||
coord_in.x = x_idx.x; \
|
||||
coord_in.y = y_idx.x; \
|
||||
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.y; \
|
||||
coord_in.y = y_idx.y; \
|
||||
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.z; \
|
||||
coord_in.y = y_idx.z; \
|
||||
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.w; \
|
||||
coord_in.y = y_idx.w; \
|
||||
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
_viv_asm(COPY, top, t0, 16); \
|
||||
_viv_asm(COPY, bottom, b0, 16); \
|
||||
} \
|
||||
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \
|
||||
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \
|
||||
top4 = right4 * x_lerp + left4; \
|
||||
VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \
|
||||
VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \
|
||||
bottom4 = right4 * x_lerp + left4; \
|
||||
bottom4 -= top4; \
|
||||
float4 dst4 = bottom4 * y_lerp + top4; \
|
||||
half4 tmp; \
|
||||
_viv_asm(CONV, tmp, dst4); \
|
||||
VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); \
|
||||
vxc_short4 result; \
|
||||
_viv_asm(COPY, result, top, 8); \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
|
||||
|
||||
|
||||
|
||||
__kernel void bilinear_grid_sample_reflect_F16_F32toF16(
|
||||
__read_only image2d_array_t input0,
|
||||
__read_only image2d_t input1,
|
||||
__write_only image2d_array_t output,
|
||||
int align_corners)
|
||||
{
|
||||
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
|
||||
int4 coord_in1 = coord_out.xyxy;
|
||||
|
||||
coord_in1.xz = coord_in1.xz * 2;
|
||||
coord_in1.z = coord_in1.z + 4;
|
||||
|
||||
float4 fxy0 = read_imagef(input1, coord_in1.xy);
|
||||
float4 fxy1 = read_imagef(input1, coord_in1.zw);
|
||||
|
||||
GRID_SAMPLE_F16_PROCESS();
|
||||
|
||||
}
|
||||
|
||||
_viv_uniform int input1_ZP;
|
||||
_viv_uniform float input1Scale;
|
||||
_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4;
|
||||
_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;
|
||||
|
||||
__kernel void bilinear_grid_sample_reflect_F16_U8toF16(
|
||||
__read_only image2d_array_t input0,
|
||||
__read_only image2d_t input1,
|
||||
__write_only image2d_array_t output,
|
||||
int align_corners)
|
||||
{
|
||||
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
|
||||
int4 coord_in1 = coord_out.xyxy;
|
||||
coord_in1.xz = coord_in1.xz * 2;
|
||||
vxc_uchar16 read_coord;
|
||||
VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
float4 fxy0;
|
||||
float4 fxy1;
|
||||
unsigned char input1ZP;
|
||||
_viv_asm(COPY, input1ZP, input1_ZP, 4);
|
||||
VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4);
|
||||
VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);
|
||||
fxy0 = fxy0 * input1Scale;
|
||||
fxy1 = fxy1 * input1Scale;
|
||||
|
||||
GRID_SAMPLE_F16_PROCESS();
|
||||
|
||||
}
|
||||
|
||||
|
||||
_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4;
|
||||
_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4;
|
||||
|
||||
__kernel void bilinear_grid_sample_reflect_F16_F16toF16(
|
||||
__read_only image2d_array_t input0,
|
||||
__read_only image2d_t input1,
|
||||
__write_only image2d_array_t output,
|
||||
int align_corners)
|
||||
{
|
||||
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
|
||||
int4 coord_in1 = coord_out.xyxy;
|
||||
|
||||
coord_in1.xz = coord_in1.xz * 2;
|
||||
|
||||
vxc_short8 read_val;
|
||||
vxc_half8 read_coord;
|
||||
|
||||
VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
_viv_asm(COPY, read_coord, read_val, 16);
|
||||
|
||||
float4 fxy0;
|
||||
float4 fxy1;
|
||||
|
||||
VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4);
|
||||
VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);
|
||||
|
||||
GRID_SAMPLE_F16_PROCESS();
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,224 @@
|
|||
#include "cl_viv_vx_ext.h"
|
||||
|
||||
_viv_uniform float2 half_input0_wh;
|
||||
_viv_uniform float2 add_float_value;
|
||||
_viv_uniform int depth;
|
||||
_viv_uniform VXC_512Bits uniEvenBintoFp32_4x4;
|
||||
_viv_uniform VXC_512Bits uniOddSubEvenBin_4x4;
|
||||
_viv_uniform VXC_512Bits uniExtact8Bit_2x8;
|
||||
_viv_uniform float uint8Scale;
|
||||
_viv_uniform float output_ZP;
|
||||
_viv_uniform float4 span_wh;
|
||||
_viv_uniform float4 min_val_wh;
|
||||
|
||||
#define GRID_SAMPLE_F16_to_U8_PROCESS() \
|
||||
fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \
|
||||
fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \
|
||||
fxy0 = fabs(fxy0 - min_val_wh); \
|
||||
fxy1 = fabs(fxy1 - min_val_wh); \
|
||||
float4 flips_xy0 = floor(fxy0 / span_wh); \
|
||||
float4 flips_xy1 = floor(fxy1 / span_wh); \
|
||||
float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \
|
||||
float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \
|
||||
int4 flips_int_xy0 = convert_int4(flips_xy0); \
|
||||
int4 flips_int_xy1 = convert_int4(flips_xy1); \
|
||||
fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \
|
||||
fxy1 = ((flips_int_xy1 % 2) == 0) ? min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \
|
||||
float4 in_x = (float4)(fxy0.xz, fxy1.xz); \
|
||||
float4 x_f = floor(in_x); \
|
||||
float4 x_lerp = in_x - x_f; \
|
||||
int4 x_idx = convert_int4(x_f); \
|
||||
float4 in_y = (float4)(fxy0.yw, fxy1.yw); \
|
||||
float4 y_f = floor(in_y); \
|
||||
float4 y_lerp = in_y - y_f; \
|
||||
int4 y_idx = convert_int4(y_f); \
|
||||
int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \
|
||||
int8 input_desc; \
|
||||
_viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \
|
||||
int baseAddr = input_desc.s0; \
|
||||
_viv_asm(MOV, coord_in.w, baseAddr); \
|
||||
vxc_short8 t0; \
|
||||
vxc_short8 b0; \
|
||||
vxc_uchar16 result; \
|
||||
vxc_half8 top; \
|
||||
vxc_half8 bottom; \
|
||||
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.y; \
|
||||
coord_in.y = y_idx.y; \
|
||||
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.z; \
|
||||
coord_in.y = y_idx.z; \
|
||||
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.w; \
|
||||
coord_in.y = y_idx.w; \
|
||||
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
_viv_asm(COPY, top, t0, 16); \
|
||||
_viv_asm(COPY, bottom, b0, 16); \
|
||||
float4 left4; \
|
||||
float4 right4; \
|
||||
float4 top4; \
|
||||
float4 bottom4; \
|
||||
int8 output_desc; \
|
||||
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
|
||||
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
|
||||
_viv_asm(MOV, coord_out.w, baseAddr); \
|
||||
int loop = depth - 1; \
|
||||
while (coord_in.z < loop) \
|
||||
{ \
|
||||
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \
|
||||
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \
|
||||
top4 = right4 * x_lerp + left4; \
|
||||
VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \
|
||||
VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \
|
||||
bottom4 = right4 * x_lerp + left4; \
|
||||
bottom4 -= top4; \
|
||||
float4 dst4 = bottom4 * y_lerp + top4; \
|
||||
dst4 = dst4 * uint8Scale + output_ZP; \
|
||||
int4 dst = convert_int4_rte(dst4); \
|
||||
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
|
||||
result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
|
||||
coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \
|
||||
coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \
|
||||
coord_in.x = x_idx.x; \
|
||||
coord_in.y = y_idx.x; \
|
||||
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.y; \
|
||||
coord_in.y = y_idx.y; \
|
||||
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.z; \
|
||||
coord_in.y = y_idx.z; \
|
||||
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.w; \
|
||||
coord_in.y = y_idx.w; \
|
||||
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
_viv_asm(COPY, top, t0, 16); \
|
||||
_viv_asm(COPY, bottom, b0, 16); \
|
||||
} \
|
||||
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \
|
||||
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \
|
||||
top4 = right4 * x_lerp + left4; \
|
||||
VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \
|
||||
VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \
|
||||
bottom4 = right4 * x_lerp + left4; \
|
||||
bottom4 -= top4; \
|
||||
float4 dst4 = bottom4 * y_lerp + top4; \
|
||||
dst4 = dst4 * uint8Scale + output_ZP; \
|
||||
int4 dst = convert_int4_rte(dst4); \
|
||||
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
|
||||
result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
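The span_wh / min_val_wh block at the top of this macro implements GridSample's reflection padding: the distance from the lower reflection bound is folded by the span, and an odd number of folds mirrors the remainder back toward the bound. A scalar sketch under the same assumptions (lo and span depend on align_corners; the helper name is illustrative):

#include <math.h>

/* Scalar sketch of the reflect mapping above: coord is the unnormalized grid
 * coordinate, lo the lower reflection bound, span the reflection period. */
static inline float reflect_coord(float coord, float lo, float span)
{
    float d     = fabsf(coord - lo);
    float flips = floorf(d / span);
    float extra = d - flips * span;
    /* an even number of flips keeps the direction, an odd number mirrors it */
    return ((int)flips % 2 == 0) ? lo + extra : lo + (span - extra);
}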
|
||||
|
||||
|
||||
__kernel void bilinear_grid_sample_reflect_F16_F32toU8(
|
||||
__read_only image2d_array_t input0,
|
||||
__read_only image2d_t input1,
|
||||
__write_only image2d_array_t output,
|
||||
int align_corners)
|
||||
{
|
||||
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
|
||||
int4 coord_in1 = coord_out.xyxy;
|
||||
|
||||
coord_in1.xz = coord_in1.xz * 2;
|
||||
coord_in1.z = coord_in1.z + 4;
|
||||
|
||||
float4 fxy0 = read_imagef(input1, coord_in1.xy);
|
||||
float4 fxy1 = read_imagef(input1, coord_in1.zw);
|
||||
GRID_SAMPLE_F16_to_U8_PROCESS();
|
||||
|
||||
}
|
||||
|
||||
_viv_uniform int input1_ZP;
|
||||
_viv_uniform float input1Scale;
|
||||
_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4;
|
||||
_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;
|
||||
|
||||
|
||||
__kernel void bilinear_grid_sample_reflect_F16_U8toU8(
|
||||
__read_only image2d_array_t input0,
|
||||
__read_only image2d_array_t input1,
|
||||
__write_only image2d_array_t output,
|
||||
int align_corners)
|
||||
{
|
||||
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
|
||||
int4 coord_in1 = coord_out.xyxy;
|
||||
|
||||
coord_in1.xz = coord_in1.xz * 2;
|
||||
|
||||
vxc_uchar16 read_coord;
|
||||
|
||||
VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
float4 fxy0;
|
||||
float4 fxy1;
|
||||
|
||||
unsigned char input1ZP;
|
||||
_viv_asm(COPY, input1ZP, input1_ZP, 4);
|
||||
|
||||
VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4);
|
||||
VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);
|
||||
|
||||
fxy0 = fxy0 * input1Scale;
|
||||
fxy1 = fxy1 * input1Scale;
|
||||
|
||||
GRID_SAMPLE_F16_to_U8_PROCESS();
|
||||
|
||||
}
|
||||
|
||||
_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4;
|
||||
_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4;
|
||||
|
||||
__kernel void bilinear_grid_sample_reflect_F16_F16toU8(
|
||||
__read_only image2d_array_t input0,
|
||||
__read_only image2d_t input1,
|
||||
__write_only image2d_array_t output,
|
||||
int align_corners)
|
||||
{
|
||||
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
|
||||
int4 coord_in1 = coord_out.xyxy;
|
||||
|
||||
coord_in1.xz = coord_in1.xz * 2;
|
||||
|
||||
vxc_short8 read_val;
|
||||
vxc_half8 read_coord;
|
||||
|
||||
VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
_viv_asm(COPY, read_coord, read_val, 16);
|
||||
|
||||
float4 fxy0;
|
||||
float4 fxy1;
|
||||
|
||||
VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4);
|
||||
VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);
|
||||
|
||||
GRID_SAMPLE_F16_to_U8_PROCESS();
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,160 @@
|
|||
#include "cl_viv_vx_ext.h"
|
||||
|
||||
_viv_uniform float2 half_input0_wh;
|
||||
_viv_uniform float2 add_float_value;
|
||||
_viv_uniform int depth;
|
||||
_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4;
|
||||
_viv_uniform VXC_512Bits uniRightSubLeft_4x4;
|
||||
_viv_uniform VXC_512Bits uniExtact8Bit_2x8;
|
||||
_viv_uniform VXC_512Bits uniDFPtoFp32_part0_4x4;
|
||||
_viv_uniform VXC_512Bits uniDFPtoFp32_part1_4x4;
|
||||
_viv_uniform float input1_scale;
|
||||
_viv_uniform float dfpScale;
|
||||
_viv_uniform float4 span_wh;
|
||||
_viv_uniform float4 min_val_wh;
|
||||
|
||||
#define GRID_SAMPLE_I16_PROCESS() \
|
||||
fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \
|
||||
fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \
|
||||
fxy0 = fabs(fxy0 - min_val_wh); \
|
||||
fxy1 = fabs(fxy1 - min_val_wh); \
|
||||
float4 flips_xy0 = floor(fxy0 / span_wh); \
|
||||
float4 flips_xy1 = floor(fxy1 / span_wh); \
|
||||
float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \
|
||||
float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \
|
||||
int4 flips_int_xy0 = convert_int4(flips_xy0); \
|
||||
int4 flips_int_xy1 = convert_int4(flips_xy1); \
|
||||
fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \
|
||||
fxy1 = ((flips_int_xy1 % 2) == 0) ? min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \
|
||||
float4 in_x = (float4)(fxy0.xz, fxy1.xz); \
|
||||
float4 x_f = floor(in_x); \
|
||||
float4 x_lerp = in_x - x_f; \
|
||||
int4 x_idx = convert_int4(x_f); \
|
||||
float4 in_y = (float4)(fxy0.yw, fxy1.yw); \
|
||||
float4 y_f = floor(in_y); \
|
||||
float4 y_lerp = in_y - y_f; \
|
||||
int4 y_idx = convert_int4(y_f); \
|
||||
int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \
|
||||
int8 input_desc; \
|
||||
_viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \
|
||||
int baseAddr = input_desc.s0; \
|
||||
_viv_asm(MOV, coord_in.w, baseAddr); \
|
||||
vxc_short8 top; \
|
||||
vxc_short8 bottom; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.y; \
|
||||
coord_in.y = y_idx.y; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.z; \
|
||||
coord_in.y = y_idx.z; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.w; \
|
||||
coord_in.y = y_idx.w; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
float4 left4; \
|
||||
float4 right4; \
|
||||
float4 top4; \
|
||||
float4 bottom4; \
|
||||
int8 output_desc; \
|
||||
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
|
||||
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
|
||||
_viv_asm(MOV, coord_out.w, baseAddr); \
|
||||
int loop = depth - 1; \
|
||||
while (coord_in.z < loop) \
|
||||
{ \
|
||||
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \
|
||||
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \
|
||||
top4 = right4 * x_lerp + left4; \
|
||||
VXC_DP4x4(left4, bottom, bottom, \
|
||||
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \
|
||||
VXC_DP4x4(right4, bottom, bottom, \
|
||||
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \
|
||||
bottom4 = right4 * x_lerp + left4; \
|
||||
bottom4 -= top4; \
|
||||
float4 dst4 = bottom4 * y_lerp + top4; \
|
||||
dst4 = dst4 * dfpScale; \
|
||||
int4 dst = convert_int4_rte(dst4); \
|
||||
VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
|
||||
coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \
|
||||
coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \
|
||||
coord_in.x = x_idx.x; \
|
||||
coord_in.y = y_idx.x; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.y; \
|
||||
coord_in.y = y_idx.y; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.z; \
|
||||
coord_in.y = y_idx.z; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.w; \
|
||||
coord_in.y = y_idx.w; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
} \
|
||||
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \
|
||||
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \
|
||||
top4 = right4 * x_lerp + left4; \
|
||||
VXC_DP4x4(left4, bottom, bottom, \
|
||||
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \
|
||||
VXC_DP4x4(right4, bottom, bottom, \
|
||||
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \
|
||||
bottom4 = right4 * x_lerp + left4; \
|
||||
bottom4 -= top4; \
|
||||
float4 dst4 = bottom4 * y_lerp + top4; \
|
||||
dst4 = dst4 * dfpScale; \
|
||||
int4 dst = convert_int4_rte(dst4); \
|
||||
VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
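Before the 8/16-bit pack, dst4 * dfpScale converts the FP32 interpolation result back into the dynamic-fixed-point output format. Assuming dfpScale is precomputed on the host as the combined input/output fixed-point rescale (the host-side setup is not shown in this hunk), the per-element effect is the usual DFP quantization, sketched here with a hypothetical helper:

#include <math.h>

/* Illustrative only: quantize an FP32 value to a dynamic-fixed-point integer
 * with 'fl' fractional bits; dfpScale above would fold 2^fl and any rescale. */
static inline int dfp_quantize(float value, int fl)
{
    return (int)lrintf(value * (float)(1 << fl)); /* round to nearest */
}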
|
||||
|
||||
|
||||
|
||||
__kernel void bilinear_grid_sample_reflect_I16_I16toI16(
|
||||
__read_only image2d_array_t input0,
|
||||
__read_only image2d_t input1,
|
||||
__write_only image2d_array_t output,
|
||||
int align_corners)
|
||||
{
|
||||
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
|
||||
int4 coord_in1 = coord_out.xyxy;
|
||||
|
||||
coord_in1.xz = coord_in1.xz * 2;
|
||||
vxc_short8 read_coord;
|
||||
VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
float4 fxy0;
|
||||
float4 fxy1;
|
||||
|
||||
VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part0_4x4);
|
||||
VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part1_4x4);
|
||||
|
||||
fxy0 = fxy0 * input1_scale;
|
||||
fxy1 = fxy1 * input1_scale;
|
||||
|
||||
GRID_SAMPLE_I16_PROCESS();
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,160 @@
|
|||
#include "cl_viv_vx_ext.h"
|
||||
|
||||
_viv_uniform float2 half_input0_wh;
|
||||
_viv_uniform float2 add_float_value;
|
||||
_viv_uniform int depth;
|
||||
_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4;
|
||||
_viv_uniform VXC_512Bits uniRightSubLeft_4x4;
|
||||
_viv_uniform VXC_512Bits uniExtact8Bit_2x8;
|
||||
_viv_uniform VXC_512Bits uniDFPtoFp32_part0_4x4;
|
||||
_viv_uniform VXC_512Bits uniDFPtoFp32_part1_4x4;
|
||||
_viv_uniform float input1_scale;
|
||||
_viv_uniform float dfpScale;
|
||||
_viv_uniform float4 span_wh;
|
||||
_viv_uniform float4 min_val_wh;
|
||||
|
||||
#define GRID_SAMPLE_I8_PROCESS() \
|
||||
fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \
|
||||
fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \
|
||||
fxy0 = fabs(fxy0 - min_val_wh); \
|
||||
fxy1 = fabs(fxy1 - min_val_wh); \
|
||||
float4 flips_xy0 = floor(fxy0 / span_wh); \
|
||||
float4 flips_xy1 = floor(fxy1 / span_wh); \
|
||||
float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \
|
||||
float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \
|
||||
int4 flips_int_xy0 = convert_int4(flips_xy0); \
|
||||
int4 flips_int_xy1 = convert_int4(flips_xy1); \
|
||||
fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \
|
||||
fxy1 = ((flips_int_xy1 % 2) == 0) ? min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \
|
||||
float4 in_x = (float4)(fxy0.xz, fxy1.xz); \
|
||||
float4 x_f = floor(in_x); \
|
||||
float4 x_lerp = in_x - x_f; \
|
||||
int4 x_idx = convert_int4(x_f); \
|
||||
float4 in_y = (float4)(fxy0.yw, fxy1.yw); \
|
||||
float4 y_f = floor(in_y); \
|
||||
float4 y_lerp = in_y - y_f; \
|
||||
int4 y_idx = convert_int4(y_f); \
|
||||
int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \
|
||||
int8 input_desc; \
|
||||
_viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \
|
||||
int baseAddr = input_desc.s0; \
|
||||
_viv_asm(MOV, coord_in.w, baseAddr); \
|
||||
vxc_char16 top; \
|
||||
vxc_char16 bottom; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.y; \
|
||||
coord_in.y = y_idx.y; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.z; \
|
||||
coord_in.y = y_idx.z; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.w; \
|
||||
coord_in.y = y_idx.w; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
float4 left4; \
|
||||
float4 right4; \
|
||||
float4 top4; \
|
||||
float4 bottom4; \
|
||||
int8 output_desc; \
|
||||
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
|
||||
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
|
||||
_viv_asm(MOV, coord_out.w, baseAddr); \
|
||||
int loop = depth - 1; \
|
||||
while (coord_in.z < loop) \
|
||||
{ \
|
||||
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \
|
||||
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \
|
||||
top4 = right4 * x_lerp + left4; \
|
||||
VXC_DP4x4(left4, bottom, bottom, \
|
||||
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \
|
||||
VXC_DP4x4(right4, bottom, bottom, \
|
||||
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \
|
||||
bottom4 = right4 * x_lerp + left4; \
|
||||
bottom4 -= top4; \
|
||||
float4 dst4 = bottom4 * y_lerp + top4; \
|
||||
dst4 = dst4 * dfpScale; \
|
||||
int4 dst = convert_int4_rte(dst4); \
|
||||
VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
|
||||
coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \
|
||||
coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \
|
||||
coord_in.x = x_idx.x; \
|
||||
coord_in.y = y_idx.x; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.y; \
|
||||
coord_in.y = y_idx.y; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.z; \
|
||||
coord_in.y = y_idx.z; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.w; \
|
||||
coord_in.y = y_idx.w; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
} \
|
||||
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \
|
||||
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \
|
||||
top4 = right4 * x_lerp + left4; \
|
||||
VXC_DP4x4(left4, bottom, bottom, \
|
||||
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \
|
||||
VXC_DP4x4(right4, bottom, bottom, \
|
||||
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \
|
||||
bottom4 = right4 * x_lerp + left4; \
|
||||
bottom4 -= top4; \
|
||||
float4 dst4 = bottom4 * y_lerp + top4; \
|
||||
dst4 = dst4 * dfpScale; \
|
||||
int4 dst = convert_int4_rte(dst4); \
|
||||
VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
|
||||
|
||||
|
||||
|
||||
__kernel void bilinear_grid_sample_reflect_I8_I8toI8(
|
||||
__read_only image2d_array_t input0,
|
||||
__read_only image2d_t input1,
|
||||
__write_only image2d_array_t output,
|
||||
int align_corners)
|
||||
{
|
||||
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
|
||||
int4 coord_in1 = coord_out.xyxy;
|
||||
|
||||
coord_in1.xz = coord_in1.xz * 2;
|
||||
vxc_char16 read_coord;
|
||||
VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
float4 fxy0;
|
||||
float4 fxy1;
|
||||
|
||||
VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part0_4x4);
|
||||
VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part1_4x4);
|
||||
|
||||
fxy0 = fxy0 * input1_scale;
|
||||
fxy1 = fxy1 * input1_scale;
|
||||
|
||||
GRID_SAMPLE_I8_PROCESS();
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,224 @@
|
|||
#include "cl_viv_vx_ext.h"
|
||||
|
||||
_viv_uniform float2 half_input0_wh;
|
||||
_viv_uniform float2 add_float_value;
|
||||
_viv_uniform int depth;
|
||||
|
||||
_viv_uniform VXC_512Bits uniU8SubZPtoFp32_left_4x4;
|
||||
_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4;
|
||||
_viv_uniform VXC_512Bits uniExtact8Bit_2x8;
|
||||
_viv_uniform int input_ZP;
|
||||
_viv_uniform float uint8Scale;
|
||||
_viv_uniform float output_ZP;
|
||||
_viv_uniform int input1_ZP;
|
||||
_viv_uniform float input1Scale;
|
||||
_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4;
|
||||
_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;
|
||||
_viv_uniform float4 span_wh;
|
||||
_viv_uniform float4 min_val_wh;
|
||||
|
||||
#define GRID_SAMPLE_U8_PROCESS() \
|
||||
fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \
|
||||
fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \
|
||||
fxy0 = fabs(fxy0 - min_val_wh); \
|
||||
fxy1 = fabs(fxy1 - min_val_wh); \
|
||||
float4 flips_xy0 = floor(fxy0 / span_wh); \
|
||||
float4 flips_xy1 = floor(fxy1 / span_wh); \
|
||||
float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \
|
||||
float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \
|
||||
int4 flips_int_xy0 = convert_int4(flips_xy0); \
|
||||
int4 flips_int_xy1 = convert_int4(flips_xy1); \
|
||||
fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \
|
||||
fxy1 = ((flips_int_xy1 % 2) == 0) ? min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \
|
||||
float4 in_x = (float4)(fxy0.xz, fxy1.xz); \
|
||||
float4 x_f = floor(in_x); \
|
||||
float4 x_lerp = in_x - x_f; \
|
||||
int4 x_idx = convert_int4(x_f); \
|
||||
float4 in_y = (float4)(fxy0.yw, fxy1.yw); \
|
||||
float4 y_f = floor(in_y); \
|
||||
float4 y_lerp = in_y - y_f; \
|
||||
int4 y_idx = convert_int4(y_f); \
|
||||
int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \
|
||||
int8 input_desc; \
|
||||
_viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \
|
||||
int baseAddr = input_desc.s0; \
|
||||
_viv_asm(MOV, coord_in.w, baseAddr); \
|
||||
vxc_uchar16 top; \
|
||||
vxc_uchar16 bottom; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.y; \
|
||||
coord_in.y = y_idx.y; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.z; \
|
||||
coord_in.y = y_idx.z; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.w; \
|
||||
coord_in.y = y_idx.w; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
float4 left4; \
|
||||
float4 right4; \
|
||||
float4 top4; \
|
||||
float4 bottom4; \
|
||||
int8 output_desc; \
|
||||
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
|
||||
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
|
||||
_viv_asm(MOV, coord_out.w, baseAddr); \
|
||||
int loop = depth - 1; \
|
||||
while (coord_in.z < loop) \
|
||||
{ \
|
||||
unsigned char inputZP; \
|
||||
_viv_asm(COPY, inputZP, input_ZP, 4); \
|
||||
VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \
|
||||
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \
|
||||
top4 = right4 * x_lerp + left4; \
|
||||
VXC_DP4x4(left4, bottom, inputZP, \
|
||||
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \
|
||||
VXC_DP4x4(right4, bottom, bottom, \
|
||||
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \
|
||||
bottom4 = right4 * x_lerp + left4; \
|
||||
bottom4 -= top4; \
|
||||
float4 dst4 = bottom4 * y_lerp + top4; \
|
||||
dst4 = dst4 * uint8Scale + output_ZP; \
|
||||
int4 dst = convert_int4_rte(dst4); \
|
||||
VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
|
||||
coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \
|
||||
coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \
|
||||
coord_in.x = x_idx.x; \
|
||||
coord_in.y = y_idx.x; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.y; \
|
||||
coord_in.y = y_idx.y; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.z; \
|
||||
coord_in.y = y_idx.z; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.w; \
|
||||
coord_in.y = y_idx.w; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
} \
|
||||
unsigned char inputZP; \
|
||||
_viv_asm(COPY, inputZP, input_ZP, 4); \
|
||||
VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \
|
||||
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \
|
||||
top4 = right4 * x_lerp + left4; \
|
||||
VXC_DP4x4(left4, bottom, inputZP, \
|
||||
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \
|
||||
VXC_DP4x4(right4, bottom, bottom, \
|
||||
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \
|
||||
bottom4 = right4 * x_lerp + left4; \
|
||||
bottom4 -= top4; \
|
||||
float4 dst4 = bottom4 * y_lerp + top4; \
|
||||
dst4 = dst4 * uint8Scale + output_ZP; \
|
||||
int4 dst = convert_int4_rte(dst4); \
|
||||
VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
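The U8 path works in affine-quantized space at both ends: the uniU8SubZPtoFp32 products subtract the input zero point before interpolating, and the pack step re-applies uint8Scale and output_ZP. A scalar sketch of the (de)quantization this relies on, with hypothetical helper names:

/* Illustrative affine U8 (de)quantization used by the macro above:
 * real = scale * (q - zero_point), q = real * inv_scale + zero_point. */
static inline float u8_dequant(unsigned char q, float scale, int zero_point)
{
    return scale * (float)((int)q - zero_point);
}

static inline unsigned char u8_requant(float real, float inv_scale, float zero_point)
{
    float q = real * inv_scale + zero_point; /* uint8Scale / output_ZP above */
    if (q < 0.0f)   q = 0.0f;
    if (q > 255.0f) q = 255.0f;
    return (unsigned char)(q + 0.5f);
}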
|
||||
|
||||
|
||||
__kernel void bilinear_grid_sample_reflect_U8_F32toU8(
|
||||
__read_only image2d_array_t input0,
|
||||
__read_only image2d_t input1,
|
||||
__write_only image2d_array_t output,
|
||||
int align_corners)
|
||||
{
|
||||
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
|
||||
int4 coord_in1 = coord_out.xyxy;
|
||||
|
||||
coord_in1.xz = coord_in1.xz * 2;
|
||||
coord_in1.z = coord_in1.z + 4;
|
||||
|
||||
float4 fxy0 = read_imagef(input1, coord_in1.xy);
|
||||
float4 fxy1 = read_imagef(input1, coord_in1.zw);
|
||||
GRID_SAMPLE_U8_PROCESS();
|
||||
|
||||
}
|
||||
|
||||
|
||||
__kernel void bilinear_grid_sample_reflect_U8_U8toU8(
|
||||
__read_only image2d_array_t input0,
|
||||
__read_only image2d_array_t input1,
|
||||
__write_only image2d_array_t output,
|
||||
int align_corners)
|
||||
{
|
||||
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
|
||||
int4 coord_in1 = coord_out.xyxy;
|
||||
|
||||
coord_in1.xz = coord_in1.xz * 2;
|
||||
|
||||
vxc_uchar16 read_coord;
|
||||
|
||||
VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
float4 fxy0;
|
||||
float4 fxy1;
|
||||
|
||||
unsigned char input1ZP;
|
||||
_viv_asm(COPY, input1ZP, input1_ZP, 4);
|
||||
|
||||
VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4);
|
||||
VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);
|
||||
|
||||
fxy0 = fxy0 * input1Scale;
|
||||
fxy1 = fxy1 * input1Scale;
|
||||
|
||||
GRID_SAMPLE_U8_PROCESS();
|
||||
|
||||
}
|
||||
|
||||
_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4;
|
||||
_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4;
|
||||
|
||||
__kernel void bilinear_grid_sample_reflect_U8_F16toU8(
|
||||
__read_only image2d_array_t input0,
|
||||
__read_only image2d_t input1,
|
||||
__write_only image2d_array_t output,
|
||||
int align_corners)
|
||||
{
|
||||
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
|
||||
int4 coord_in1 = coord_out.xyxy;
|
||||
|
||||
coord_in1.xz = coord_in1.xz * 2;
|
||||
|
||||
vxc_short8 read_val;
|
||||
vxc_half8 read_coord;
|
||||
|
||||
VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
_viv_asm(COPY, read_coord, read_val, 16);
|
||||
|
||||
float4 fxy0;
|
||||
float4 fxy1;
|
||||
|
||||
VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4);
|
||||
VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);
|
||||
|
||||
GRID_SAMPLE_U8_PROCESS();
|
||||
|
||||
}
|
||||
|
||||

|
|
@ -39,6 +39,7 @@
|
|||
#include "vsi_nn_internal_node.h"
|
||||
#include "utils/vsi_nn_constraint_check.h"
|
||||
#include "utils/vsi_nn_dtype_util.h"
|
||||
#include "utils/vsi_nn_dtype_util_prv.h"
|
||||
#include "vsi_nn_error.h"
|
||||
|
||||
#define _INPUT_NUM (1)
|
||||
|
|
@ -199,14 +200,11 @@ static vsi_bool op_setup
|
|||
vsi_nn_internal_node_t* curr = NULL;
|
||||
float min = self->nn_param.clip.min;
|
||||
float max = self->nn_param.clip.max;
|
||||
uint32_t infinity = VSI_NN_FLOAT32_INF;
|
||||
float neg_infinity = -*(float*)&infinity;
|
||||
int32_t max_float = *(int32_t*)&max;
|
||||
|
||||
if ( (min == -1.0f && max == 1.0f)
|
||||
|| (min == 0.0f && max == 6.0f)
|
||||
|| (min == 0.0f && max_float == VSI_NN_FLOAT32_INF)
|
||||
|| (min == neg_infinity && max_float == VSI_NN_FLOAT32_INF))
|
||||
|| (min == 0.0f && fp32_is_inf(max))
|
||||
|| (fp32_is_inf(-min) && fp32_is_inf(max)))
|
||||
{
|
||||
vsi_nn_internal_init_node_wksp(self);
|
||||
if (min == -1.0f && max == 1.0f)
|
||||
|
|
@ -217,7 +215,7 @@ static vsi_bool op_setup
|
|||
{
|
||||
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU6, 0, 0);
|
||||
}
|
||||
else if (min == 0.0f && max_float == VSI_NN_FLOAT32_INF)
|
||||
else if (min == 0.0f && fp32_is_inf(max))
|
||||
{
|
||||
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU, 0, 0);
|
||||
}
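The clip setup now relies on fp32_is_inf() instead of comparing the raw bit pattern against VSI_NN_FLOAT32_INF. For illustration only (this is not necessarily how the ovxlib helper is implemented), an IEEE-754 single-precision infinity test on the bit pattern looks like this:

#include <stdint.h>
#include <string.h>

/* Illustrative +/- infinity check: exponent all ones, mantissa zero. */
static inline int fp32_is_inf_sketch(float value)
{
    uint32_t bits;
    memcpy(&bits, &value, sizeof(bits));
    return (bits & 0x7FFFFFFFu) == 0x7F800000u;
}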
|
||||
|
|
|
|||
|
|
@ -430,10 +430,13 @@ static vsi_bool op_setup
|
|||
// create activation output/hstate_output/cstate_output
|
||||
vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_CELL_OUT_OUTPUT]->attr.dtype, TRUE);
|
||||
act_out = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
|
||||
CHECK_PTR_FAIL_GOTO(act_out, "Create internal tensor failed", final);
|
||||
vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_CELL_OUT_H_STATE]->attr.dtype, TRUE);
|
||||
act_h_out = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
|
||||
CHECK_PTR_FAIL_GOTO(act_h_out, "Create internal tensor failed", final);
|
||||
vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_CELL_OUT_C_STATE]->attr.dtype, TRUE);
|
||||
act_c_out = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
|
||||
CHECK_PTR_FAIL_GOTO(act_c_out, "Create internal tensor failed", final);
|
||||
curr->outputs[LSTMUNIT_ACT_OUTPUT] = act_out->t;
|
||||
curr->outputs[LSTMUNIT_ACT_HSTATE_OUT] = act_h_out->t;
|
||||
curr->outputs[LSTMUNIT_ACT_CSTATE_OUT] = act_c_out->t;
|
||||
|
|
|
|||
|
|
@ -89,7 +89,7 @@ static vsi_status op_grouped_compute
|
|||
if (NULL == LOCAL()->weight_tensor_group)
|
||||
{
|
||||
VSILOGE("Malloc fail, (GROUPED_DECONV2D) at [%s : %d]\n", __FILE__, __LINE__);
|
||||
return VSI_FAILURE;
|
||||
goto final;
|
||||
}
|
||||
memset(LOCAL()->weight_tensor_group, 0, group * sizeof(vsi_nn_tensor_t *));
|
||||
res = vsi_nn_CreateTensorGroup(self->graph, inputs[1], 2,
|
||||
|
|
@ -325,8 +325,8 @@ static vsi_status op_compute
|
|||
#endif
|
||||
// param.a_x = self->nn_param.deconv.dilation;
|
||||
// param.a_y = self->nn_param.deconv.dilation;
|
||||
param.ext.khr.a_x = 1;
|
||||
param.ext.khr.a_y = 1;
|
||||
param.ext.khr.a_x = (size_t)self->nn_param.deconv.output_padding[0];
|
||||
param.ext.khr.a_y = (size_t)self->nn_param.deconv.output_padding[1];
|
||||
param.ext.khr.padding_x = (size_t)self->nn_param.deconv.pad[0];
|
||||
param.ext.khr.padding_y = (size_t)self->nn_param.deconv.pad[2];
|
||||
param.ext.khr.overflow_policy = self->vx_param.overflow_policy;
|
||||
|
|
@ -336,6 +336,7 @@ static vsi_status op_compute
|
|||
param.ext.channel_group = self->nn_param.deconv.group;
|
||||
param.stride_x = self->nn_param.deconv.stride[0];
|
||||
param.stride_y = self->nn_param.deconv.stride[1];
|
||||
param.down_scale_size_rounding = self->vx_param.down_scale_size_rounding;
|
||||
//param.border_mode;
|
||||
//param.border_const;
|
||||
|
||||
|
|
|
|||
|
|
@ -66,7 +66,7 @@ static vsi_status op_compute
|
|||
input_tensor[1] = tmp_tensor;
|
||||
|
||||
self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
|
||||
"signal_frame",
|
||||
"extra_ending",
|
||||
input_tensor, 2,
|
||||
outputs, 1, NULL );
|
||||
|
||||
|
|
|
|||
|
|
@ -65,9 +65,15 @@ static vsi_status op_compute
|
|||
}
|
||||
else
|
||||
{
|
||||
#define _TENSOR_LEN 64
|
||||
vsi_nn_tensor_attr_t attr;
|
||||
vsi_nn_tensor_t* temp_tensors = NULL;
|
||||
|
||||
char gather_tensor_name[_TENSOR_LEN];
|
||||
char copy_tensor_name[_TENSOR_LEN];
|
||||
memset(gather_tensor_name, 0, sizeof(gather_tensor_name));
|
||||
memset(copy_tensor_name, 0, sizeof(copy_tensor_name));
|
||||
|
||||
VSILOGW("gather is no_range_change operation! \
|
||||
Insert DataConvert Operation when the quantization parameters of input and output are inconsistent!");
|
||||
|
||||
|
|
@ -78,7 +84,20 @@ static vsi_status op_compute
|
|||
temp_tensors = vsi_nn_CreateTensor( self->graph, &attr );
|
||||
|
||||
vsi_nn_kernel_selector( self->graph, "gather", inputs, 2, &temp_tensors, 1, param );
|
||||
snprintf(gather_tensor_name, sizeof(gather_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 0);
|
||||
if(vxSetReferenceName((vx_reference)temp_tensors->t, gather_tensor_name) == VSI_FAILURE)
|
||||
{
|
||||
VSILOGW("Set uid %u gather node output name fail", self->uid);
|
||||
return VSI_FAILURE;
|
||||
}
|
||||
|
||||
n = vxTensorCopyNode( self->graph->g, temp_tensors->t, outputs[0]->t);
|
||||
snprintf(copy_tensor_name, sizeof(copy_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 1);
|
||||
if(vxSetReferenceName((vx_reference)outputs[0]->t, copy_tensor_name) == VSI_FAILURE)
|
||||
{
|
||||
VSILOGW("Set uid %u copy node output name fail", self->uid);
|
||||
return VSI_FAILURE;
|
||||
}
|
||||
|
||||
vsi_safe_release_tensor(temp_tensors);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -38,6 +38,7 @@
|
|||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
|
||||
#include "utils/vsi_nn_constraint_check.h"
|
||||
#include "vsi_nn_error.h"
|
||||
|
||||
typedef struct _gather_elements_local_data_t {
|
||||
int32_t placeholder;
|
||||
|
|
@ -101,6 +102,7 @@ static vsi_status op_compute
|
|||
attr.is_const = FALSE;
|
||||
attr.vtl = TRUE;
|
||||
temp_tensors = vsi_nn_CreateTensor( self->graph, &attr );
|
||||
CHECK_PTR_FAIL_GOTO( temp_tensors, "Create tensor fail.", final );
|
||||
}
|
||||
else
|
||||
{
|
||||
|
|
@ -148,6 +150,7 @@ static vsi_status op_compute
|
|||
vsi_safe_release_tensor(temp_tensors);
|
||||
}
|
||||
|
||||
final:
|
||||
vsi_nn_kernel_param_release( ¶m );
|
||||
|
||||
if ( self->n )
|
||||
|
|
|
|||
|
|
@ -53,11 +53,13 @@ static vsi_status op_compute
|
|||
|
||||
vsi_nn_kernel_param_t* param = NULL;
|
||||
int32_t align_corners = self->nn_param.gridsample.align_corners;
|
||||
int32_t pad_mode = (int32_t)self->nn_param.gridsample.padding_mode;
|
||||
vsi_nn_kernel_node_t n;
|
||||
char kernel_name[128];
|
||||
|
||||
param = vsi_nn_kernel_param_create();
|
||||
vsi_nn_kernel_param_add_int32(param, "align_corners", align_corners);
|
||||
vsi_nn_kernel_param_add_int32(param, "padding_mode", pad_mode);
|
||||
|
||||
switch (self->nn_param.gridsample.mode) {
|
||||
case VSI_NN_INTERPOLATION_BILINEAR:
|
||||
|
|
@ -103,13 +105,20 @@ static vsi_bool op_check
|
|||
return FALSE;
|
||||
}
|
||||
|
||||
if (!((VSI_NN_PAD_MODE_CONSTANT ==
|
||||
if ((VSI_NN_PAD_MODE_CONSTANT ==
|
||||
self->nn_param.gridsample.padding_mode) &&
|
||||
(0 == self->nn_param.gridsample.const_val))) {
|
||||
(0 != self->nn_param.gridsample.const_val)) {
|
||||
VSILOGE("Only support padding const 0 now!");
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
|
||||
if (VSI_NN_PAD_MODE_SYMMETRIC ==
|
||||
self->nn_param.gridsample.padding_mode) {
|
||||
VSILOGE("Can't support VSI_NN_PAD_MODE_SYMMETRIC now!");
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
} /* op_check() */
|
||||
|
||||
|
|
@ -124,6 +133,11 @@ static vsi_bool op_setup
|
|||
return FALSE;
|
||||
}
|
||||
|
||||
if (2 != inputs[1]->attr.size[0])
|
||||
{
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) {
|
||||
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
|
||||
outputs[0]->attr.size[0] = inputs[1]->attr.size[1];
|
||||
|
|
@ -133,6 +147,16 @@ static vsi_bool op_setup
|
|||
outputs[0]->attr.size[3] = inputs[0]->attr.size[3];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if ((outputs[0]->attr.dim_num != inputs[0]->attr.dim_num)
|
||||
|| (outputs[0]->attr.size[0] != inputs[1]->attr.size[1])
|
||||
|| (outputs[0]->attr.size[1] != inputs[1]->attr.size[2]))
|
||||
{
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
} /* op_setup() */
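The added checks pin down the expected shapes: the grid tensor's innermost dimension must be 2 (one (x, y) pair per output location), and the output's spatial dimensions must match the grid's. Since ovxlib stores sizes innermost-first, the constraint can be sketched as below (illustrative helper and shape comments, not taken from the diff):

#include <stdbool.h>
#include <stddef.h>

/* Mirrors the shape checks above: grid = {2, W_out, H_out, ...},
 * output = {W_out, H_out, ...}. */
static bool gridsample_shapes_ok(const size_t *grid_size, const size_t *out_size)
{
    return grid_size[0] == 2 &&
           out_size[0]  == grid_size[1] &&
           out_size[1]  == grid_size[2];
}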
|
||||
|
|
|
|||
|
|
@ -121,6 +121,7 @@ static vsi_status op_compute
|
|||
vsi_nn_tensor_t ** outputs
|
||||
)
|
||||
{
|
||||
#define _TENSOR_LEN 64
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_param_t * param = NULL;
|
||||
vsi_nn_kernel_node_t n = NULL;
|
||||
|
|
@ -129,15 +130,36 @@ static vsi_status op_compute
|
|||
vsi_size_t new_rank = 0;
|
||||
vsi_nn_tensor_t * tmp_tensors[4] = {NULL};
|
||||
|
||||
char reshape0_tensor_name[_TENSOR_LEN];
|
||||
char reshape1_tensor_name[_TENSOR_LEN];
|
||||
char instance_norm_tensor_name[_TENSOR_LEN];
|
||||
|
||||
memset(reshape0_tensor_name, 0, sizeof(reshape0_tensor_name));
|
||||
memset(reshape1_tensor_name, 0, sizeof(reshape1_tensor_name));
|
||||
memset(instance_norm_tensor_name, 0, sizeof(instance_norm_tensor_name));
|
||||
|
||||
vsi_nn_optimize_instance_norm_shape(inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank);
|
||||
|
||||
tmp_tensors[0] = vsi_nn_kernel_insert_reshape_node( self->graph,
|
||||
inputs[0], shape, (uint32_t)new_rank, VSI_NN_OPTIMIZE_BACKWARD );
|
||||
snprintf(reshape0_tensor_name, sizeof(reshape0_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 0);
|
||||
if(vxSetReferenceName((vx_reference)tmp_tensors[0]->t, reshape0_tensor_name) == VSI_FAILURE)
|
||||
{
|
||||
VSILOGW("Set uid %u reshape 0 node output name fail", self->uid);
|
||||
return VSI_FAILURE;
|
||||
}
|
||||
tmp_tensors[1] = inputs[1];
|
||||
tmp_tensors[2] = inputs[2];
|
||||
tmp_tensors[3] = vsi_nn_kernel_insert_reshape_node( self->graph,
|
||||
outputs[0], shape, (uint32_t)new_rank, VSI_NN_OPTIMIZE_FORWARD );
|
||||
|
||||
snprintf(reshape1_tensor_name, sizeof(reshape1_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 1);
|
||||
if(vxSetReferenceName((vx_reference)outputs[0]->t, reshape1_tensor_name) == VSI_FAILURE)
|
||||
{
|
||||
VSILOGW("Set uid %u reshap 1 node output name fail", self->uid);
|
||||
return VSI_FAILURE;
|
||||
}
|
||||
|
||||
status = _try_set_high_presision_tensor(tmp_tensors);
|
||||
if (status != VSI_SUCCESS)
|
||||
{
|
||||
|
|
@ -155,6 +177,12 @@ static vsi_status op_compute
|
|||
self->n = (vx_node)n;
|
||||
status = VSI_SUCCESS;
|
||||
}
|
||||
snprintf(instance_norm_tensor_name, sizeof(instance_norm_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 2);
|
||||
if(vxSetReferenceName((vx_reference)tmp_tensors[3]->t, instance_norm_tensor_name) == VSI_FAILURE)
|
||||
{
|
||||
VSILOGW("Set uid %u instance_norm node output name fail", self->uid);
|
||||
return VSI_FAILURE;
|
||||
}
|
||||
|
||||
if (param != NULL)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -259,9 +259,9 @@ static vsi_status op_deinit
|
|||
vsi_nn_node_t * self
|
||||
)
|
||||
{
|
||||
vsi_nn_safe_free(self->nn_param.layernorm.local);
|
||||
|
||||
#if (!VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
|
||||
vsi_nn_safe_free(self->nn_param.layernorm.local);
|
||||
vsi_nn_internal_deinit_node_wksp( self );
|
||||
#endif
|
||||
|
||||
|
|
|
|||
|
|
@ -161,14 +161,6 @@ static vsi_bool _check_is_sp_supported_type
|
|||
{
|
||||
int32_t * axes = self->nn_param.reduce.local2->axes;
|
||||
int32_t axes_num = self->nn_param.reduce.local2->axes_num;
|
||||
vsi_size_t shapes[4][VSI_NN_MAX_DIM_NUM] = { {0} };
|
||||
int32_t axis_in[VSI_NN_MAX_DIM_NUM] = {0};
|
||||
int32_t new_axis[VSI_NN_MAX_DIM_NUM] = {0};
|
||||
int32_t i = 0;
|
||||
uint32_t axis_size = 0;
|
||||
uint32_t rank_in = 0;
|
||||
uint32_t rank_out = 0;
|
||||
vsi_bool ret = FALSE;
|
||||
|
||||
if ( !self->graph->ctx->config.support_stream_processor ||
|
||||
(type != VSI_NN_REDUCE_SUM && type != VSI_NN_REDUCE_MEAN && type != VSI_NN_REDUCE_MAX) )
|
||||
|
|
@ -191,22 +183,15 @@ static vsi_bool _check_is_sp_supported_type
|
|||
return FALSE;
|
||||
}
|
||||
|
||||
for (i = 0; i < axes_num; i++)
|
||||
if ( (axes_num == 1 && (axes[0] == 0 || axes[0] == 2)) ||
|
||||
(axes_num == 2 && ((axes[0] < 2 && axes[1] < 2) || (axes[0] == 1 && axes[1] == 2))) )
|
||||
{
|
||||
shapes[0][i] = input->attr.size[axes[i]];
|
||||
shapes[1][i] = 1;
|
||||
axis_in[i] = i;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
ret = vsi_nn_kernel_optimize_reduce_shape(
|
||||
shapes[0], axes_num,
|
||||
axis_in, axes_num,
|
||||
shapes[1], axes_num,
|
||||
shapes[2], &rank_in, shapes[3], &rank_out,
|
||||
new_axis, &axis_size);
|
||||
|
||||
return ret && axis_size < 3;
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
static vsi_status op_compute
|
||||
(
|
||||
vsi_nn_node_t * self,
|
||||
|
|
@ -839,82 +824,28 @@ static vsi_bool op_set_sp_reduce_internal
|
|||
vsi_enum type_name
|
||||
)
|
||||
{
|
||||
vsi_nn_tensor_attr_t attr;
|
||||
vsi_nn_internal_tensor_t* tensor1 = NULL;
|
||||
vsi_nn_tensor_t* new_output = NULL;
|
||||
uint32_t* permute_in_perm = NULL;
|
||||
int32_t * new_axis = NULL;
|
||||
vsi_size_t shapes[VSI_NN_MAX_DIM_NUM] = {1};
|
||||
int32_t use_virtual_tensor = TRUE;
|
||||
vsi_nn_internal_node_t* tmp_inode = NULL;
|
||||
int32_t * axes = self->nn_param.reduce.local2->axes;
|
||||
int32_t axes_num = self->nn_param.reduce.local2->axes_num;
|
||||
int32_t i = 0, j = 0, index = 0;
|
||||
vsi_size_t reduce_size = 1;
|
||||
vsi_bool ret = FALSE;
|
||||
int32_t i = 0;
|
||||
|
||||
vsi_nn_internal_init_node_wksp( self );
|
||||
|
||||
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
|
||||
memcpy(&attr.dtype, &inputs[0]->attr.dtype, sizeof(vsi_nn_dtype_t));
|
||||
attr.dim_num = VSI_NN_DIM_AUTO;
|
||||
attr.vtl = use_virtual_tensor;
|
||||
attr.is_const = FALSE;
|
||||
tensor1 = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
|
||||
CHECK_PTR_FAIL_GOTO(tensor1, "Create internal tensor failed", final);
|
||||
|
||||
tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_PERMUTE, 0, 0 );
|
||||
tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_REDUCE_MEAN_INTERNAL, 0, 0 );
|
||||
CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final);
|
||||
permute_in_perm = (uint32_t *)vsi_nn_internal_new_node_param(tmp_inode,
|
||||
inputs[0]->attr.dim_num * sizeof(uint32_t));
|
||||
CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(permute_in_perm, tmp_inode, "Create buffer failed", final);
|
||||
tmp_inode->inputs[0] = inputs[0];
|
||||
tmp_inode->outputs[0] = outputs[0];
|
||||
tmp_inode->node->nn_param.reduce_mean_internal.axis = axes;
|
||||
tmp_inode->node->nn_param.reduce_mean_internal.axis_num = axes_num;
|
||||
tmp_inode->node->nn_param.reduce_mean_internal.type = type_name;
|
||||
|
||||
for ( i = 0; i < axes_num; i++)
|
||||
for (i = 0; i < axes_num; i++)
|
||||
{
|
||||
shapes[index] = outputs[0]->attr.size[axes[i]];
|
||||
permute_in_perm[index ++] = axes[i];
|
||||
reduce_size *= inputs[0]->attr.size[axes[i]];
|
||||
}
|
||||
|
||||
for ( j = 0; j < (int32_t)inputs[0]->attr.dim_num; j++)
|
||||
{
|
||||
for (i = 0; i < axes_num; i++)
|
||||
{
|
||||
if (j == axes[i])
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (i == axes_num)
|
||||
{
|
||||
shapes[index] = outputs[0]->attr.size[j];
|
||||
permute_in_perm[index ++] = j;
|
||||
}
|
||||
}
|
||||
tmp_inode->node->nn_param.permute.perm = permute_in_perm;
|
||||
tmp_inode->node->nn_param.permute.dim_num = inputs[0]->attr.dim_num;
|
||||
tmp_inode->inputs[0] = inputs[0];
|
||||
tmp_inode->outputs[0] = tensor1->t;
|
||||
vsi_nn_internal_setup_node(self, tmp_inode);
|
||||
|
||||
new_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shapes, outputs[0]->attr.dim_num);
|
||||
CHECK_PTR_FAIL_GOTO(new_output, "Create tensor failed", final);
|
||||
self->nn_param.reduce.local2->reshaped_output = new_output;
|
||||
|
||||
tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_REDUCE_MEAN_INTERNAL, 0, 0 );
|
||||
CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final);
|
||||
new_axis = (int32_t *)vsi_nn_internal_new_node_param(tmp_inode,
|
||||
axes_num * sizeof(int32_t));
|
||||
CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(new_axis, tmp_inode, "Create buffer failed", final);
|
||||
for (i = 0; i < axes_num; i++)
|
||||
{
|
||||
new_axis[i] = i;
|
||||
}
|
||||
tmp_inode->inputs[0] = tensor1->t;
|
||||
tmp_inode->outputs[0] = new_output;
|
||||
tmp_inode->node->nn_param.reduce_mean_internal.axis = new_axis;
|
||||
tmp_inode->node->nn_param.reduce_mean_internal.axis_num = axes_num;
|
||||
tmp_inode->node->nn_param.reduce_mean_internal.type = type_name;
|
||||
if (type_name == VSI_NN_REDUCE_SUM)
|
||||
{
|
||||
tmp_inode->node->nn_param.reduce_mean_internal.scale = 1.0f;
|
||||
|
|
@ -1147,6 +1078,7 @@ static vsi_bool op_set_reduce_internal
|
|||
re_sizes[axes[2]] = 1;
|
||||
new_output = vsi_nn_reshape_tensor(self->graph, outputs[0], re_sizes, dim_num);
|
||||
}
|
||||
self->nn_param.reduce.local2->reshaped_output = new_output;
|
||||
|
||||
curr = vsi_nn_internal_new_node( self, type_name, 0, 0 );
|
||||
CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
|
||||
|
|
@ -1161,7 +1093,6 @@ static vsi_bool op_set_reduce_internal
|
|||
curr->inputs[0] = tmp_output_tensor[1]->t;
|
||||
}
|
||||
curr->outputs[0] = new_output;
|
||||
self->nn_param.reduce.local2->reshaped_output = new_output;
|
||||
vsi_nn_internal_setup_node(self, curr);
|
||||
}
|
||||
else
|
||||
|
|
|
|||
|
|
@ -136,7 +136,7 @@ static vsi_bool op_setup
|
|||
attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
|
||||
|
||||
reducesum_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
|
||||
CHECK_PTR_FAIL_GOTO(square_tensor, "Create internal tensor failed", final);
|
||||
CHECK_PTR_FAIL_GOTO(reducesum_tensor, "Create internal tensor failed", final);
|
||||
reducesum_node = vsi_nn_internal_new_node( self, VSI_NN_OP_REDUCE, 0, 0);
|
||||
CHECK_PTR_FAIL_GOTO(reducesum_node, "Create internal node failed", final);
|
||||
|
||||
|
|
|
|||
|
|
@ -37,6 +37,7 @@
|
|||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_internal_node.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "utils/vsi_nn_dtype_util_prv.h"
|
||||
|
||||
|
||||
static vsi_status op_compute
|
||||
|
|
@ -88,7 +89,6 @@ static vsi_bool op_setup
|
|||
float alpha = 0;
|
||||
float max_value = 0;
|
||||
float threshold = 0;
|
||||
uint32_t max_raw = 0;
|
||||
vsi_bool ret = FALSE;
|
||||
|
||||
if ( NULL == self )
|
||||
|
|
@ -101,11 +101,9 @@ static vsi_bool op_setup
|
|||
max_value = p->max_value;
|
||||
threshold = p->threshold;
|
||||
|
||||
max_raw = *(uint32_t*)&max_value;
|
||||
|
||||
vsi_nn_internal_init_node_wksp(self);
|
||||
|
||||
if (alpha == 0 && max_raw == VSI_NN_FLOAT32_INF && threshold == 0)
|
||||
if (alpha == 0.0f && fp32_is_inf(max_value) && threshold == 0.0f)
|
||||
{
|
||||
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU, 0, 0);
|
||||
CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
|
||||
|
|
@ -119,14 +117,14 @@ static vsi_bool op_setup
|
|||
curr->inputs[0] = inputs[0];
|
||||
curr->outputs[0] = outputs[0];
|
||||
}
|
||||
else if (alpha == 0 && max_value == 6.0f && threshold == 0)
|
||||
else if (alpha == 0.0f && max_value == 6.0f && threshold == 0.0f)
|
||||
{
|
||||
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU6, 0, 0);
|
||||
CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
|
||||
curr->inputs[0] = inputs[0];
|
||||
curr->outputs[0] = outputs[0];
|
||||
}
|
||||
else if (alpha == 0.1 && max_value == VSI_NN_FLOAT32_INF && threshold == 0)
|
||||
else if (alpha == 0.1f && max_value == VSI_NN_FLOAT32_INF && threshold == 0.0f)
|
||||
{
|
||||
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_LEAKY_RELU, 0, 0);
|
||||
CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
|
||||
|
|
|
|||
|
|
@ -70,7 +70,11 @@ static vsi_status op_compute
self->graph,
(uint8_t *)self->nn_param.reshape.size,
&attr);

if (NULL == dims_tensor)
{
VSILOGE( "Create tensor fail." );
return VSI_FAILURE;
}
reshape_param.dims = REQUIRED_IO(dims_tensor);

self->n = vxTensorReshapeNode(self->graph->g,

@ -75,6 +75,11 @@ static vsi_status op_compute
self->graph,
(uint8_t *)dims_data,
&attr);
if (NULL == dims_tensor)
{
VSILOGE( "Create tensor fail." );
return VSI_FAILURE;
}

reshape_param.dims = REQUIRED_IO(dims_tensor);

@ -52,6 +52,7 @@ static vsi_status op_compute
vsi_status status = VSI_FAILURE;
int32_t align_corners = self->nn_param.resize_internal.align_corners;
int32_t half_pixel_centers = self->nn_param.resize_internal.half_pixel_centers;
int32_t type = self->nn_param.resize_internal.type;
vsi_enum layout = self->nn_param.resize_internal.layout;
vsi_nn_kernel_param_t * param = NULL;

@ -59,6 +60,7 @@ static vsi_status op_compute

vsi_nn_kernel_param_add_int32( param, "align_corners", align_corners );
vsi_nn_kernel_param_add_int32( param, "half_pixel_centers", half_pixel_centers );
vsi_nn_kernel_param_add_int32( param, "type", type );

if (layout == VSI_NN_RESIZE_LAYOUT_NCHW)
{

@ -186,6 +188,7 @@ static vsi_status op_init
vsi_status status = VSI_SUCCESS;

self->nn_param.resize_internal.layout = VSI_NN_RESIZE_LAYOUT_NCHW;
self->nn_param.resize_internal.type = VSI_NN_INTERPOLATION_BILINEAR;

return status;
} /* op_init() */
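With the new type field, resize_internal forwards the interpolation mode to the kernel alongside align_corners and half_pixel_centers, and op_init defaults it to bilinear. A hedged sketch of how a caller might select a different mode when building a graph; the op enum name, the VSI_NN_INTERPOLATION_* value and the tensor ids in_id/out_id are assumptions for illustration, not confirmed by this diff:

    /* Hypothetical graph-construction snippet (graph, in_id, out_id assumed to exist). */
    vsi_nn_node_t* node = vsi_nn_AddNode(graph, VSI_NN_OP_RESIZE_INTERNAL, 0, 0, NULL);
    if (node != NULL)
    {
        node->nn_param.resize_internal.align_corners      = FALSE;
        node->nn_param.resize_internal.half_pixel_centers = TRUE;
        node->nn_param.resize_internal.factor             = 2.0f;
        node->nn_param.resize_internal.layout             = VSI_NN_RESIZE_LAYOUT_NCHW;
        /* Assumed enum name; op_init defaults to VSI_NN_INTERPOLATION_BILINEAR. */
        node->nn_param.resize_internal.type = VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR;
        node->input.tensors[0]  = in_id;
        node->output.tensors[0] = out_id;
    }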
@ -0,0 +1,202 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#include <string.h>
#include <stdlib.h>

#include "vsi_nn_types.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_node.h"
#include "utils/vsi_nn_math.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
#include "vsi_nn_tensor_util_prv.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_constraint_check.h"
#include "vsi_nn_error.h"

/*
Declare number of input and output.
*/
#define _INPUT_NUM (2)
#define _OUTPUT_NUM (1)

static vsi_status op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_param_t* param = NULL;
vsi_nn_kernel_node_t n = NULL;
float eps = self->nn_param.rmsnorm.eps;
int32_t axis = self->nn_param.rmsnorm.axis;

param = vsi_nn_kernel_param_create();

vsi_nn_kernel_param_add_float32(param, "eps", eps);
vsi_nn_kernel_param_add_int32(param, "axis", axis);
n = vsi_nn_kernel_selector(self->graph, "rms_norm",
inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param);
if (n != NULL)
{
self->n = (vx_node)n;
status = VSI_SUCCESS;
}

if (param != NULL)
{
vsi_nn_kernel_param_release(&param);
}

return status;
} /* op_compute() */

static vsi_bool op_check
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_bool ret = vsi_nn_is_stream_process_supported_types(self->graph, inputs, self->input.num);

if (!ret)
{
BEGIN_IO_TYPE_DECL(RMS_NORM, 2, 1)
IO_TYPE(D_F32, D_F32, D_F32)
IO_TYPE(D_F16, D_F32, D_F16)
IO_TYPE(D_F16, D_F32, D_F16)
IO_TYPE(D_F16, D_F32, D_U8 | Q_ASYM)
IO_TYPE(D_F16, D_F32, D_U8 | Q_ASYM)
IO_TYPE(D_F16, D_F32, D_I8 | Q_DFP)
IO_TYPE(D_F16, D_F32, D_I8 | Q_DFP)
IO_TYPE(D_F16, D_F32, D_I8 | Q_ASYM)
IO_TYPE(D_F16, D_F32, D_I8 | Q_ASYM)
IO_TYPE(D_F16, D_F32, D_I8 | Q_SYM)
IO_TYPE(D_F16, D_F32, D_I8 | Q_SYM)
IO_TYPE(D_F16, D_F32, D_I16 | Q_DFP)
IO_TYPE(D_F16, D_F32, D_I16 | Q_DFP)
IO_TYPE(D_F16, D_F32, D_I16 | Q_ASYM)
IO_TYPE(D_F16, D_F32, D_I16 | Q_ASYM)
IO_TYPE(D_F16, D_F32, D_I16 | Q_SYM)
IO_TYPE(D_F16, D_F32, D_I16 | Q_SYM)
IO_TYPE(D_BF16, D_F32, D_BF16)
IO_TYPE(D_U8 | Q_ASYM, D_F32, D_F16)
IO_TYPE(D_U8 | Q_ASYM, D_F32, D_U8 | Q_ASYM)
IO_TYPE(D_I16 | Q_DFP, D_F32, D_I16 | Q_DFP)
IO_TYPE(D_I16 | Q_ASYM, D_F32, D_I16 | Q_ASYM)
IO_TYPE(D_I16 | Q_SYM, D_F32, D_I16 | Q_SYM)
IO_TYPE(D_I16 | Q_DFP, D_F32, D_F16)
IO_TYPE(D_I16 | Q_ASYM, D_F32, D_F16)
IO_TYPE(D_I16 | Q_SYM, D_F32, D_F16)
IO_TYPE(D_I8 | Q_DFP, D_F32, D_I8 | Q_DFP)
IO_TYPE(D_I8 | Q_ASYM, D_F32, D_I8 | Q_ASYM)
IO_TYPE(D_I8 | Q_SYM, D_F32, D_I8 | Q_SYM)
IO_TYPE(D_I8 | Q_DFP, D_F32, D_F16)
IO_TYPE(D_I8 | Q_ASYM, D_F32, D_F16)
IO_TYPE(D_I8 | Q_SYM, D_F32, D_F16)
IO_TYPE(D_U8 | Q_ASYM, D_F32, D_U8 | Q_ASYM)
IO_TYPE(D_U8 | Q_ASYM, D_F32, D_F16)
IO_TYPE(D_I16 | Q_DFP, D_F32, D_I16 | Q_DFP)
IO_TYPE(D_I16 | Q_ASYM, D_F32, D_I16 | Q_ASYM)
IO_TYPE(D_I16 | Q_SYM, D_F32, D_I16 | Q_SYM)
IO_TYPE(D_I16 | Q_DFP, D_F32, D_F16)
IO_TYPE(D_I16 | Q_ASYM, D_F32, D_F16)
IO_TYPE(D_I16 | Q_SYM, D_F32, D_F16)
IO_TYPE(D_I8 | Q_DFP, D_F32, D_I8 | Q_DFP)
IO_TYPE(D_I8 | Q_ASYM, D_F32, D_I8 | Q_ASYM)
IO_TYPE(D_I8 | Q_SYM, D_F32, D_I8 | Q_SYM)
IO_TYPE(D_I8 | Q_DFP, D_F32, D_F16)
IO_TYPE(D_I8 | Q_ASYM, D_F32, D_F16)
IO_TYPE(D_I8 | Q_SYM, D_F32, D_F16)
END_IO_TYPE_DECL(RMS_NORM)
if (!VALIDATE_OP_IO_TYPES(RMS_NORM, self, inputs, self->input.num, outputs, self->output.num))
{
char* desc = generate_op_io_types_desc(inputs,
self->input.num, outputs, self->output.num);
VSILOGE("Inputs/Outputs data type not support: %s", desc);
destroy_op_io_types_desc(desc);
return FALSE;
}
}

return TRUE;
} /* op_check() */

static vsi_bool op_setup
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_bool ret = TRUE;

if (NULL == self)
{
return FALSE;
}

ret = vsi_nn_op_common_setup(self, inputs, outputs);

return ret;
} /* op_setup() */

static vsi_status op_init
(
vsi_nn_node_t* self
)
{
self->nn_param.rmsnorm.axis = 0;
self->nn_param.rmsnorm.eps = 1e-8f;
return VSI_SUCCESS;
} /* op_init() */

__BEGIN_DECLS

/* Registrar */
DEF_OP_REG
(
/* op_name */ RMSNORM,
/* init */ op_init,
/* compute */ op_compute,
/* deinit */ vsi_nn_op_common_deinit,
/* check */ op_check,
/* setup */ op_setup,
/* optimize */ NULL,
/* input_num */ _INPUT_NUM,
/* output_num */ _OUTPUT_NUM
);

__END_DECLS
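The new RMSNORM op takes two inputs (the data tensor and a float32 scale tensor, per the IO table in op_check) and dispatches to the "rms_norm" kernel with eps and axis parameters. A minimal sketch of wiring it into a graph; the tensor ids in_id, gamma_id and out_id are hypothetical and assumed to have been created elsewhere with matching shapes:

    vsi_nn_node_t* node = vsi_nn_AddNode(graph, VSI_NN_OP_RMSNORM, 0, 0, NULL);
    if (node != NULL)
    {
        node->nn_param.rmsnorm.eps  = 1e-6f; /* op_init defaults to 1e-8f */
        node->nn_param.rmsnorm.axis = 0;     /* normalize along the innermost dimension */
        node->input.tensors[0]  = in_id;     /* data */
        node->input.tensors[1]  = gamma_id;  /* per-channel scale, float32 */
        node->output.tensors[0] = out_id;
    }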
@ -0,0 +1,196 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#include <string.h>
#include <stdlib.h>

#include "vsi_nn_types.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_node.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_test.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"

typedef struct _shape_local_data_t {
vsi_nn_tensor_t *shape_tensor;
} shape_local_data_t;

/*
Declare number of input and output.
*/
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)

static vsi_status op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
VSI_UNREFERENCED(inputs);
VSI_UNREFERENCED(outputs);
return vsi_nn_internal_compute_node( self );
} /* op_compute() */

static vsi_bool op_check
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
VSI_UNREFERENCED(self);
VSI_UNREFERENCED(inputs);

if (outputs[0]->attr.dtype.vx_type != VSI_NN_TYPE_INT32)
{
VSILOGD("Outputs data type not support");

return FALSE;
}

return TRUE;
} /* op_check() */

static vsi_status op_optimize
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs,
vsi_nn_opt_direction_e direction
)
{
VSI_UNREFERENCED(inputs);
VSI_UNREFERENCED(outputs);
return vsi_nn_internal_optimize_node( self, direction );
} /* op_optimize() */

static vsi_bool op_setup
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_bool ret = TRUE;
vsi_nn_internal_node_t* curr = NULL;
vsi_nn_tensor_attr_t attr;
vx_int32 shapes[VSI_NN_MAX_DIM_NUM] = {0};
uint32_t i = 0;

for ( i = 0; i < inputs[0]->attr.dim_num; i++ )
{
shapes[i] = (int32_t)inputs[0]->attr.size[i];
}

memset(&attr, 0, sizeof(attr));
attr.size[0] = inputs[0]->attr.dim_num;
attr.dim_num = 1;
attr.is_const = TRUE;
attr.dtype.vx_type = VSI_NN_TYPE_INT32;
attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
self->nn_param.shape.local->shape_tensor = vsi_nn_CreateTensorFromData(
self->graph,
(uint8_t *)shapes,
&attr);
if ( NULL == self->nn_param.shape.local->shape_tensor )
{
VSILOGE("Create shape_tensor fail.(shape)");
goto final;
}

vsi_nn_internal_init_node_wksp(self);
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 1, 1);
if (NULL == curr)
{
return ret;
}
curr->inputs[0] = self->nn_param.shape.local->shape_tensor;
curr->outputs[0] = outputs[0];

ret = vsi_nn_internal_setup_node(self, curr);

final:

return ret;
} /* op_setup() */

static vsi_status op_init
(
vsi_nn_node_t* self
)
{
self->nn_param.shape.local =
(shape_local_data_t *)malloc(sizeof(shape_local_data_t));
if (NULL == self->nn_param.shape.local)
{
return VSI_FAILURE;
}
memset( self->nn_param.shape.local, 0, sizeof(shape_local_data_t) );

return VSI_SUCCESS;
} /* op_init() */

static vsi_status op_deinit
(
vsi_nn_node_t * self
)
{
if (self->nn_param.shape.local)
{
vsi_safe_release_tensor(self->nn_param.shape.local->shape_tensor);
}
vsi_nn_safe_free(self->nn_param.shape.local);
vsi_nn_internal_deinit_node_wksp(self);
vsi_nn_op_common_deinit(self);

return VSI_SUCCESS;
} /* op_deinit() */

__BEGIN_DECLS

/* Registrar */
DEF_OP_REG
(
/* op_name */ SHAPE,
/* init */ op_init,
/* compute */ op_compute,
/* deinit */ op_deinit,
/* check */ op_check,
/* setup */ op_setup,
/* optimize */ op_optimize,
/* input_num */ _INPUT_NUM,
/* output_num */ _OUTPUT_NUM
);

__END_DECLS
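op_setup for the new SHAPE op snapshots the dimensions of inputs[0] into a 1-D const INT32 tensor and emits an internal DATACONVERT node that copies it to the output, so the result is the rank-length shape vector. A minimal usage sketch; in_id and shape_id are hypothetical tensor ids, and the output tensor must be INT32 (op_check rejects anything else):

    vsi_nn_node_t* node = vsi_nn_AddNode(graph, VSI_NN_OP_SHAPE, 0, 0, NULL);
    if (node != NULL)
    {
        node->input.tensors[0]  = in_id;
        node->output.tensors[0] = shape_id; /* 1-D, length == rank of in_id, VSI_NN_TYPE_INT32 */
    }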
@ -735,6 +735,15 @@ static vsi_bool op_setup

outputs[0]->attr.dim_num++;
}

/*output dim_num is 0, the tensor should be scalar!*/
if (outputs[0]->attr.dim_num == 0)
{
outputs[0]->attr.dim_num = 1;
outputs[0]->attr.size[0] = 1;

vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
}
}

_get_stride_slice_start_stop_stride(self, inputs, outputs);

@ -124,9 +124,6 @@ static vsi_status op_compute
outputs[0]->attr.size, outputs[0]->attr.dim_num, axis,
shapes[1], &rank_out, &new_axis1);

param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_int32( param, "top_k", top_k );

if (ret)
{
uint32_t perm_in[VSI_NN_MAX_DIM_NUM] = {0};

@ -195,10 +192,14 @@ static vsi_status op_compute
outputs_tensor[1] = reshape_tensors[2];
}

param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_int32( param, "top_k", top_k );

self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "topk",
&input_tensor, _INPUT_NUM,
outputs_tensor, _OUTPUT_NUM, param );

vsi_nn_kernel_param_release( &param );
if (axis != 0)
{
_create_permute_node(self, outputs_tensor[0], reshape_tensors[1], perm_out, rank_in, TRUE);
@ -25,6 +25,7 @@

#include <string.h>
#include <stdlib.h>
#include <float.h>

#include "vsi_nn_types.h"
#include "vsi_nn_log.h"

@ -47,8 +48,6 @@ typedef struct _upsamplescale_local_data_t {
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)

#define _EPSILON 1e-8

static vsi_status op_compute
(
vsi_nn_node_t * self,

@ -69,7 +68,7 @@ static vsi_status op_compute
stride = self->nn_param.upsamplescale.stride;
scale = self->nn_param.upsamplescale.scale;

if (stride == 1 || vsi_nn_abs(scale - 1.0f) == _EPSILON)
if (stride == 1 || vsi_nn_abs(scale - 1.0f) < FLT_EPSILON)
{
return vsi_nn_internal_compute_node( self );
}

@ -148,7 +147,7 @@ static vsi_status op_optimize
VSI_UNREFERENCED(inputs);
VSI_UNREFERENCED(outputs);

if (stride == 1 && vsi_nn_abs(scale - 1.0f) == _EPSILON)
if (stride == 1 && vsi_nn_abs(scale - 1.0f) < FLT_EPSILON)
{
return vsi_nn_internal_optimize_node( self, direction );
}

@ -174,7 +173,7 @@ static vsi_bool op_setup

vsi_nn_internal_init_node_wksp(self);

if (stride == 1 && vsi_nn_abs(scale - 1.0f) == _EPSILON)
if (stride == 1 && vsi_nn_abs(scale - 1.0f) < FLT_EPSILON)
{
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 0, 0);
CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);

@ -194,7 +193,7 @@ static vsi_bool op_setup

ret = vsi_nn_internal_setup_node(self, curr);
}
else if (vsi_nn_abs(scale - 1.0f) == _EPSILON)
else if (vsi_nn_abs(scale - 1.0f) < FLT_EPSILON)
{
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RESIZE, 0, 0);
CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
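The upsamplescale hunks replace vsi_nn_abs(scale - 1.0f) == _EPSILON with a < FLT_EPSILON comparison: equality against the old 1e-8 constant is essentially never true (for scale == 1.0f the difference is exactly 0.0f, not 1e-8), so the pass-through branches could not trigger. A small sketch of the corrected test, illustrative only:

    #include <float.h>
    #include <math.h>

    /* True when scale is within one float epsilon of 1.0f, i.e. upsamplescale
     * degenerates to a plain data conversion (stride == 1) or a resize. */
    static int scale_is_identity(float scale)
    {
        return fabsf(scale - 1.0f) < FLT_EPSILON;
    }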
@ -469,6 +469,8 @@ static _op_param_gen_t s_op_gen[] =
/* REDUCEL2 */ NULL,
/* CROP_AND_RESIZE */ NULL,
/* TAN */ NULL,
/* RMSNORM */ NULL,
/* SHAPE */ NULL,
};
_compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c );

@ -46,6 +46,11 @@
#include "utils/vsi_nn_math.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_dtype_util_prv.h"

#if (defined(__ANDROID__)) && (__ANDROID_API__ > 21)
#include <sys/system_properties.h>
#endif

typedef struct _vx_status_desc_t
{

@ -387,10 +392,11 @@ float vsi_nn_DataAsFloat32
)
{
float val;
uint32_t *p = (uint32_t*)(&val);
fp32_bit_cast_t fp32_bit_cast;
int16_t fp16;

*p = 0xFFFFFFFF;
fp32_bit_cast.data = 0xFFFFFFFF;
val = fp32_bit_cast.val;
switch( type )
{
case VSI_NN_TYPE_BOOL8:

@ -1462,11 +1468,15 @@ void vsi_nn_get_tensor_clamp_min_max
}
else
{
uint32_t f32_min = 0xff800000;
uint32_t f32_max = 0x7f800000;
fp32_bit_cast_t fp32_bit_cast;
float pos_infinity;
float neg_infinity;
fp32_bit_cast.data = VSI_NN_FLOAT32_INF;
pos_infinity = fp32_bit_cast.val;
neg_infinity = -pos_infinity;

*clampMin = *(float*)&f32_min;
*clampMax = *(float*)&f32_max;
*clampMin = neg_infinity;
*clampMax = pos_infinity;
}
}
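The vsi_nn_DataAsFloat32 and vsi_nn_get_tensor_clamp_min_max hunks route float/bit conversions through fp32_bit_cast_t instead of dereferencing casted pointers, avoiding strict-aliasing undefined behaviour. A sketch of the presumed shape of that helper and of how the clamp code builds plus/minus infinity from VSI_NN_FLOAT32_INF (0x7F800000); the union definition here is illustrative, not the library's actual one:

    #include <stdint.h>

    typedef union
    {
        uint32_t data; /* raw IEEE-754 bit pattern */
        float    val;  /* the float it encodes */
    } example_fp32_bit_cast_t;

    static float example_float32_inf(void)
    {
        example_fp32_bit_cast_t cast;
        cast.data = 0x7F800000u; /* same value as VSI_NN_FLOAT32_INF */
        return cast.val;         /* +inf; negate it for the lower clamp bound */
    }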
@ -93,7 +93,7 @@ final:
return status;
}

#if (defined(__ANDROID__)) && (ANDROID_SDK_VERSION >= 30)
#if (defined(__ANDROID__)) && ((ANDROID_SDK_VERSION >= 30) || (__ANDROID_API__ >= 30))
static const char* ENV_ENABLE_SHADER = "vendor.VIV_VX_ENABLE_SHADER";
static const char* ENV_ENABLE_OPCHECK = "vendor.VSI_NN_ENABLE_OPCHECK";
static const char* ENV_ENABLE_CONCAT_OPTIMIZE = "vendor.VSI_NN_ENABLE_CONCAT_OPTIMIZE";

@ -799,6 +799,7 @@ static vsi_status batchInference_graph
original_inputs_attr = (vsi_nn_tensor_attr_t*)malloc(sizeof(vsi_nn_tensor_attr_t) * graph->max_node_io);
original_outputs_attr = (vsi_nn_tensor_attr_t*)malloc(sizeof(vsi_nn_tensor_attr_t) * graph->max_node_io);
approximateConstTensor = (vsi_nn_tensor_id_t*)malloc(sizeof(vsi_nn_tensor_id_t) * graph->tensor_num);
CHECK_PTR_FAIL_GOTO(approximateConstTensor, "Malloc fail.", final);
memset(approximateConstTensor, -1, sizeof(vsi_nn_tensor_id_t) * graph->tensor_num);

if (NULL == inputs || NULL == outputs || NULL == original_inputs_attr || NULL == original_outputs_attr)

@ -878,6 +879,7 @@ static vsi_status batchInference_graph
vsi_size_t iterator_list_index = 0;
vsi_size_t list_index = 0;
vsi_size_t* iterator_list = (vsi_size_t*)malloc(sizeof(vsi_size_t) * (batchNum + 1));
CHECK_PTR_FAIL_GOTO(iterator_list, "Malloc fail.", final);
memset(iterator_list, 0, sizeof(uint32_t) * (batchNum + 1));

if (((vsi_nn_node_prv_t*)node)->split_num > 0)

@ -885,6 +887,7 @@ static vsi_status batchInference_graph
iterator_list[iterator_list_index++] = ((vsi_nn_node_prv_t*)node)->split_num;
if (((vsi_nn_node_prv_t*)node)->split_num == 1)
{/*if user set split_num = 1, there is no need to batch split.*/
vsi_nn_safe_free(iterator_list);
continue;
}
}

@ -1015,6 +1018,7 @@ static vsi_status batchInference_graph
}
}

vsi_nn_safe_free(iterator_list);
/*restore node input batch number*/
num_of_node_inputs = node->input.num;
for (k = 0; k < num_of_node_inputs; k++)

@ -1053,7 +1057,7 @@ static vsi_status batchInference_graph
}
}

final:
final:
for (i = 0; i < graph->node_num; i++)
{
node_id = nodes_list[i];

@ -1067,7 +1071,7 @@ static vsi_status batchInference_graph
node->input.num, inputs);
vsi_nn_GetTensors(graph, node->output.tensors,
node->output.num, outputs);
for (j = 0; j < node->output.num; j++)
for (j = 0; outputs && j < node->output.num; j++)
{
if (outputs[j] == NULL)
{

@ -716,6 +716,12 @@ vsi_status vsi_nn_internal_optimize_node
for ( i = n - 1; i >= 0; i-- )
{
curr = (vsi_nn_internal_node_t *)vsi_nn_LinkListGetIndexNode((vsi_nn_link_list_t *)WKSP(node), i);
if ( NULL == curr )
{
VSILOGE("get point fail");
status = VSI_FAILURE;
break;
}
VSILOGD("Optimize backward for node uid[%u] sub_uid[%u] op[%s]",
node->uid, curr->node->uid, vsi_nn_OpGetName(curr->node->op));

@ -29,7 +29,7 @@
#include "vsi_nn_log.h"
#include "vsi_nn_types.h"

#if (defined(__ANDROID__)) && (ANDROID_SDK_VERSION >= 30)
#if (defined(__ANDROID__)) && ((ANDROID_SDK_VERSION >= 30) || (__ANDROID_API__ >= 30))
static const char* ENV_LOG_LEVEL = "vendor.VSI_NN_LOG_LEVEL";
#else
static const char* ENV_LOG_LEVEL = "VSI_NN_LOG_LEVEL";

@ -1013,6 +1013,7 @@ vsi_status vsi_nn_AddBinaryGraphInputsWithCropParamForCropOnly
sizeof(numParams));
if (VSI_SUCCESS != status)
{
vsi_nn_safe_free(nodes);
goto final;
}
for (p = 0; p < numParams; p++)

@ -3258,6 +3258,7 @@ static vsi_bool _init_dummy_tensor
#endif
// This is a hack that driver doesn't support const scales
scales = (float*)malloc(sizeof(float) * tensor->attr.dtype.scale_dim);
CHECK_PTR_FAIL_GOTO( scales, "Create buffer fail.", final );
memcpy(scales,
tensor->attr.dtype.scales,
tensor->attr.dtype.scale_dim * sizeof(float));