Update internal for 22Q3 release

update internal to commit-id: e2b0fde631fce349e0e3ad42b2a4d40ce7634a97

Type: Code Improvement
Signed-off-by: Feiyue Chen <Feiyue.Chen@verisilicon.com>

parent 7baf8c307f
commit ed162d0176
@@ -179,3 +179,7 @@ DEF_OP(SOFTSIGN)
DEF_OP(CUMSUM)
DEF_OP(MAXPOOLWITHARGMAX)
DEF_OP(MOD)
DEF_OP(LPPOOL)
DEF_OP(SCATTER_ELEMENTS)
DEF_OP(PRE_PROCESS_YUV422)
DEF_OP(BUCKETIZE)
@@ -326,9 +326,20 @@ const void * vsi_nn_kernel_param_get_const_buffer
    } \
    static vsi_status NAME##_impl

#define DEF_SP_KERNEL_BASE_CALLBACK( NAME ) \
    static vsi_status NAME##_impl( vsi_nn_kernel_node_t node); \
    static vx_status VX_CALLBACK NAME( \
            vx_node node) {\
        return (vx_status)NAME##_impl( \
                (vsi_nn_kernel_node_t)node); \
    } \
    static vsi_status NAME##_impl

#define DEF_KERNEL_INITIALIZER( NAME ) DEF_KERNEL_BASE_CALLBACK( NAME )
#define DEF_KERNEL_EXECUTOR( NAME ) DEF_KERNEL_BASE_CALLBACK( NAME )
#define DEF_KERNEL_DEINITIALIZER( NAME ) DEF_KERNEL_BASE_CALLBACK( NAME )
#define DEF_SP_KERNEL_QUERY( NAME ) DEF_SP_KERNEL_BASE_CALLBACK( NAME )

void vsi_nn_kernel_backend_register
    (
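For reference, a minimal sketch of how the new DEF_SP_KERNEL_BASE_CALLBACK macro is meant to be used, following the same pattern as the existing DEF_KERNEL_INITIALIZER callbacks; the wrapper bridges the OpenVX vx_node callback signature to ovxlib's vsi_nn_kernel_node_t-based implementation. The kernel name below is illustrative, not part of the commit:

```c
/* Hypothetical usage sketch derived from the macro body above. */
DEF_SP_KERNEL_QUERY(my_sp_query)
    (
    vsi_nn_kernel_node_t node
    )
{
    /* ... query stream-processor support for this node ... */
    return VSI_SUCCESS;
}

/* After preprocessing this expands roughly to:
 *
 *   static vsi_status my_sp_query_impl( vsi_nn_kernel_node_t node );
 *   static vx_status VX_CALLBACK my_sp_query( vx_node node ) {
 *       return (vx_status)my_sp_query_impl( (vsi_nn_kernel_node_t)node );
 *   }
 *   static vsi_status my_sp_query_impl( vsi_nn_kernel_node_t node ) { ... }
 */
```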
@@ -85,4 +85,10 @@ vsi_bool vsi_nn_kernel_optimize_group_norm_shape
    int32_t is_sp_kernel, vsi_size_t* out_shape
    );

vsi_bool vsi_nn_kernel_optimize_scatter_elements_shape
    (
    const vsi_size_t* shape_x, const vsi_size_t rank_x, const int32_t axis,
    vsi_size_t* out_shape_x, uint32_t* out_rank_x, int32_t* out_axis, vsi_size_t max_size
    );

#endif
@@ -48,6 +48,10 @@ typedef int32_t vsi_nn_kernel_lut_act_e; enum
    VSI_NN_KERNEL_LUT_CELU = 14,
    VSI_NN_KERNEL_LUT_RCP = 15,
    VSI_NN_KERNEL_LUT_SOFTSIGN = 16,
    VSI_NN_KERNEL_LUT_LINEAR_EXP = 17,
    VSI_NN_KERNEL_LUT_LINEAR_RSQRT = 18,
    VSI_NN_KERNEL_LUT_LINEAR_SIGMOID = 19,
};

#define VSI_NN_KERNEL_LUT_MAX_SIZE (1024)
@@ -38,22 +38,26 @@ enum
    BI_RNN_FW_INPUT_WEIGHT_I = 1,
    BI_RNN_FW_INPUT_WEIGHT_H = 2,
    BI_RNN_FW_INPUT_BIAS = 3,
    BI_RNN_FW_INPUT_H_STATE = 4,
    BI_RNN_FW_INPUT_BIAS_I = 3,
    BI_RNN_FW_INPUT_BIAS_H = 4,
    BI_RNN_FW_INPUT_H_STATE = 5,

    BI_RNN_BW_INPUT_WEIGHT_I = 5,
    BI_RNN_BW_INPUT_WEIGHT_H = 6,
    BI_RNN_BW_INPUT_BIAS = 7,
    BI_RNN_BW_INPUT_H_STATE = 8,
    BI_RNN_BW_INPUT_WEIGHT_I = 6,
    BI_RNN_BW_INPUT_WEIGHT_H = 7,
    BI_RNN_BW_INPUT_BIAS_I = 8,
    BI_RNN_BW_INPUT_BIAS_H = 9,
    BI_RNN_BW_INPUT_H_STATE = 10,

    BI_RNN_AUX_INPUT = 9,
    BI_RNN_FW_AUX_INPUT_WEIGHT = 10,
    BI_RNN_BW_AUX_INPUT_WEIGHT = 11,
    BI_RNN_AUX_INPUT = 11,
    BI_RNN_FW_AUX_INPUT_WEIGHT = 12,
    BI_RNN_BW_AUX_INPUT_WEIGHT = 13,

    BI_RNN_INPUT_CNT,

    BI_RNN_FW_OUTPUT_OUTPUT = 0,
    BI_RNN_BW_OUTPUT_OUTPUT = 1,
    BI_RNN_FW_OUTPUT_H_STATE = 0,
    BI_RNN_BW_OUTPUT_H_STATE = 1,
    BI_RNN_FW_OUTPUT_OUTPUT = 2,
    BI_RNN_BW_OUTPUT_OUTPUT = 3,
    BI_RNN_OUTPUT_CNT
};
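The renumbering above splits the fused bias input into separate input-bias and hidden-bias slots, shifting every later index for both directions. A minimal sketch of how a caller would attach tensors under the new layout (the tensor-id variables are hypothetical, and `node->input.tensors[]` is assumed to be the usual ovxlib node wiring):

```c
/* Hedged wiring sketch for the updated bidirectional-RNN input layout. */
node->input.tensors[BI_RNN_FW_INPUT_WEIGHT_I] = fw_weight_i;
node->input.tensors[BI_RNN_FW_INPUT_WEIGHT_H] = fw_weight_h;
node->input.tensors[BI_RNN_FW_INPUT_BIAS_I]   = fw_bias_i;  /* was BI_RNN_FW_INPUT_BIAS */
node->input.tensors[BI_RNN_FW_INPUT_BIAS_H]   = fw_bias_h;  /* new slot */
node->input.tensors[BI_RNN_FW_INPUT_H_STATE]  = fw_h_state; /* index moved from 4 to 5 */
```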
@@ -0,0 +1,48 @@
/****************************************************************************
*
*    Copyright (c) 2020 Vivante Corporation
*
*    Permission is hereby granted, free of charge, to any person obtaining a
*    copy of this software and associated documentation files (the "Software"),
*    to deal in the Software without restriction, including without limitation
*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
*    and/or sell copies of the Software, and to permit persons to whom the
*    Software is furnished to do so, subject to the following conditions:
*
*    The above copyright notice and this permission notice shall be included in
*    all copies or substantial portions of the Software.
*
*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
*    DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#ifndef _VSI_NN_OP_BUCKETIZE_H
#define _VSI_NN_OP_BUCKETIZE_H

#include "vsi_nn_types.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef struct _vsi_nn_bucketize_param
{
    struct _bucketize_local_data_t* local;
    // Add parameters here
    vsi_bool right;
} vsi_nn_bucketize_param;
_compiler_assert(offsetof(vsi_nn_bucketize_param, local) == 0, \
    vsi_nn_bucketize_h );

#ifdef __cplusplus
}
#endif

#endif
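A hedged usage sketch for the new BUCKETIZE operation, assuming the usual ovxlib node-creation flow with vsi_nn_AddNode; the graph variable and tensor counts are illustrative, not part of the commit:

```c
/* Minimal sketch: add a BUCKETIZE node (input, boundaries -> bucket indices). */
vsi_nn_node_t* node = vsi_nn_AddNode( graph, VSI_NN_OP_BUCKETIZE, 2, 1, NULL );
node->nn_param.bucketize.right = TRUE;  /* bucket edges are right-inclusive */
```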
@@ -54,6 +54,7 @@ typedef struct _vsi_nn_conv1d_param
    uint32_t group;
    uint32_t dilation;
    int32_t multiplier;
    vsi_nn_pad_mode_e pad_mode;
} vsi_nn_conv1d_param;
_compiler_assert(offsetof(vsi_nn_conv1d_param, local) == 0, \
    vsi_nn_vsi_nn_conv1d_h );

@@ -30,6 +30,20 @@
extern "C" {
#endif

typedef struct _vsi_nn_conv2d_param_deprecate
{
    uint32_t ksize[2];
    uint32_t stride[2];
    /* Pad left, right, top, bottom */
    uint32_t pad[4];
    /* Pad type default value shall be AUTO */
    vsi_nn_pad_e pad_type;
    uint32_t weights;
    uint32_t group;
    uint32_t dilation[2];
    int32_t multiplier;
} vsi_nn_conv2d_param_deprecate;

typedef struct _vsi_nn_conv2d_param
{
    uint32_t ksize[2];

@@ -42,6 +56,7 @@ typedef struct _vsi_nn_conv2d_param
    uint32_t group;
    uint32_t dilation[2];
    int32_t multiplier;
    vsi_nn_pad_mode_e pad_mode;
} vsi_nn_conv2d_param;

#ifdef __cplusplus

@@ -47,6 +47,7 @@ typedef struct _vsi_nn_conv3d_param
    int32_t weights;

    int32_t multiplier;
    vsi_nn_pad_mode_e pad_mode;
} vsi_nn_conv3d_param;
_compiler_assert(offsetof(vsi_nn_conv3d_param, local) == 0, \
    vsi_nn_conv3d_h );

@@ -43,6 +43,7 @@ typedef struct _vsi_nn_deconv3d_param
    uint32_t weights;
    uint32_t group;
    uint32_t output_padding[3];
    vsi_nn_pad_mode_e pad_mode;
} vsi_nn_deconv3d_param;
_compiler_assert(offsetof(vsi_nn_deconv3d_param, local) == 0, \
    vsi_nn_deconv3d_h );

@@ -36,6 +36,7 @@ typedef struct _vsi_nn_depthwise_conv1d_param
    uint32_t pad[2];
    uint32_t dilation;
    int32_t multiplier;
    vsi_nn_pad_mode_e pad_mode;
} vsi_nn_depthwise_conv1d_param;

__END_DECLS

@@ -51,6 +51,7 @@ typedef struct _vsi_nn_grouped_conv1d_param
    uint32_t group;
    uint32_t dilation;
    int32_t multiplier;
    vsi_nn_pad_mode_e pad_mode;
} vsi_nn_grouped_conv1d_param;

#ifdef __cplusplus

@@ -43,6 +43,7 @@ typedef struct _vsi_nn_grouped_conv2d_param
    uint32_t dilation[2];
    int32_t multiplier;
    void* local;
    vsi_nn_pad_mode_e pad_mode;
} vsi_nn_grouped_conv2d_param;

#ifdef __cplusplus
@@ -86,7 +86,7 @@ typedef struct _vsi_nn_l2normalizescale_lcl_data
{
    vx_tensor local_tensor[_VSI_NN_L2NORMALIZESCALE_LOCAL_TENSOR_NUM];
    uint32_t hash_idx;
    vsi_bool execute_on_sw;
    vsi_bool use_internal_node;
} vsi_nn_l2normalizescale_lcl_data;

typedef struct _vsi_nn_l2normalizescale_param

@@ -35,14 +35,20 @@ extern "C" {

typedef struct _vsi_nn_layernorm_lcl_data
{
    vx_tensor local_tensor[_VSI_NN_LAYERNORM_LOCAL_TENSOR_NUM];
    vsi_bool use_internal_node;
} vsi_nn_layernorm_lcl_data;

typedef struct _vsi_nn_layernormalize_param
{
    /* local data must be the first. */
    vsi_nn_layernorm_lcl_data local;
    union
    {
        vx_tensor local_tensor[_VSI_NN_LAYERNORM_LOCAL_TENSOR_NUM];
        vsi_nn_layernorm_lcl_data *local;
    };

    float eps;
    int32_t axis;
} vsi_nn_layernormalize_param;

#ifdef __cplusplus
@@ -0,0 +1,46 @@
/****************************************************************************
*
*    Copyright (c) 2020 Vivante Corporation
*
*    ... (MIT license text identical to vsi_nn_op_bucketize.h above) ...
*
*****************************************************************************/

#ifndef _VSI_NN_OP_LPPOOL_H
#define _VSI_NN_OP_LPPOOL_H

#include "vsi_nn_types.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef struct _vsi_nn_lppool_param {
    vsi_nn_pad_e pad_type;
    uint32_t ksize[2];
    int32_t p;
    uint32_t pad[4];
    uint32_t stride[2];
} vsi_nn_lppool_param;

#ifdef __cplusplus
}
#endif

#endif
@@ -30,13 +30,6 @@
extern "C" {
#endif

typedef enum {
    VSI_NN_PAD_MODE_CONSTANT,
    VSI_NN_PAD_MODE_REPLICATE,
    VSI_NN_PAD_MODE_SYMMETRIC,
    VSI_NN_PAD_MODE_REFLECT,
} vsi_nn_pad_mode_e;

typedef struct _vsi_nn_pad_param
{
    const uint32_t * front_size;
@@ -0,0 +1,81 @@
/****************************************************************************
*
*    Copyright (c) 2020 Vivante Corporation
*
*    ... (MIT license text identical to vsi_nn_op_bucketize.h above) ...
*
*****************************************************************************/

#ifndef _VSI_NN_OP_PRE_PROCESS_YUV422_H
#define _VSI_NN_OP_PRE_PROCESS_YUV422_H

#include "vsi_nn_types.h"

#ifdef __cplusplus
extern "C" {
#endif

#define _VSI_NN_PRE_PROCESS_YUV422_LOCAL_TENSOR_NUM 2

typedef struct _vsi_nn_pre_process_yuv422_lcl_data
{
    int32_t scale_x;
    int32_t scale_y;
    vsi_bool enable_copy;
    vsi_bool enable_perm;
    vx_tensor local_tensor[_VSI_NN_PRE_PROCESS_YUV422_LOCAL_TENSOR_NUM];
} vsi_nn_pre_process_yuv422_lcl_data;

typedef struct _vsi_nn_pre_process_yuv422_param
{
    vsi_nn_pre_process_yuv422_lcl_data* local;

    vsi_nn_yuv_type yuv422_type;

    struct
    {
        uint32_t left;
        uint32_t top;
        uint32_t width;
        uint32_t height;
    } rect;

    struct
    {
        vsi_size_t *size;
        uint32_t dim_num;
    } output_attr;

    uint32_t * perm;
    uint32_t dim_num;

    float r_mean;
    float g_mean;
    float b_mean;
    float rgb_scale;

    vsi_bool reverse_channel;
} vsi_nn_pre_process_yuv422_param;

#ifdef __cplusplus
}
#endif

#endif
@@ -37,11 +37,12 @@ enum
    RNNCELL_INPUT_INPUT = 0,
    RNNCELL_INPUT_WEIGHT_I = 1,
    RNNCELL_INPUT_WEIGHT_H = 2,
    RNNCELL_INPUT_BIAS = 3,
    RNNCELL_INPUT_H_STATE = 4,
    RNNCELL_INPUT_BIAS_I = 3,
    RNNCELL_INPUT_BIAS_H = 4,
    RNNCELL_INPUT_H_STATE = 5,

    RNNCELL_INPUT_AUX_INPUT = 5,
    RNNCELL_INPUT_AUX_WEIGHT = 6,
    RNNCELL_INPUT_AUX_INPUT = 6,
    RNNCELL_INPUT_AUX_WEIGHT = 7,
    RNNCELL_INPUT_CNT,

    RNNCELL_OUTPUT_H_STATE = 0,
@@ -0,0 +1,49 @@
/****************************************************************************
*
*    Copyright (c) 2020 Vivante Corporation
*
*    ... (MIT license text identical to vsi_nn_op_bucketize.h above) ...
*
*****************************************************************************/

#ifndef _VSI_NN_OP_SCATTER_ELEMENTS_H
#define _VSI_NN_OP_SCATTER_ELEMENTS_H

#include "vsi_nn_types.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef struct _vsi_nn_scatter_elements_param
{
    struct _scatter_elements_local_data_t* local;
    // Add parameters here
    int32_t axis;
    vsi_nn_reduction_type_e reduction;
} vsi_nn_scatter_elements_param;
_compiler_assert(offsetof(vsi_nn_scatter_elements_param, local) == 0, \
    vsi_nn_scatter_elements_h );

#ifdef __cplusplus
}
#endif

#endif
@@ -37,11 +37,13 @@ enum
    RNN_INPUT_INPUT = 0,
    RNN_INPUT_WEIGHT_I = 1,
    RNN_INPUT_WEIGHT_H = 2,
    RNN_INPUT_BIAS = 3,
    RNN_INPUT_H_STATE = 4,
    RNN_INPUT_BIAS_I = 3,
    RNN_INPUT_BIAS_H = 4,
    RNN_INPUT_H_STATE = 5,
    RNN_INPUT_CNT,

    RNN_OUTPUT_OUTPUT = 0,
    RNN_OUTPUT_H_STATE = 0,
    RNN_OUTPUT_OUTPUT = 1,
    RNN_OUTPUT_CNT
};
@@ -253,11 +253,11 @@ static VSI_INLINE_API int32_t fp32_to_dfp
    type_get_range( type, &max_range, &min_range );
    if( fl > 0 )
    {
        data = (int32_t)vsi_rint( in * (float)( (int64_t)1 << fl ) );
        data = (int32_t)vsi_rint( in * (double)( (int64_t)1 << fl ) );
    }
    else
    {
        data = (int32_t)vsi_rint( in * ( 1.0f / (float)( (int64_t)1 << -fl ) ) );
        data = (int32_t)vsi_rint( in * ( 1.0f / (double)( (int64_t)1 << -fl ) ) );
    }
    data = vsi_nn_min( data, (int32_t)max_range );
    data = vsi_nn_max( data, (int32_t)min_range );
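A worked example of what the dynamic-fixed-point conversion above computes, using illustrative values that are not part of the commit:

```c
/* For fractional length fl = 8 and in = 1.37f:
 *   data = (int32_t)vsi_rint( 1.37 * (double)(1 << 8) )  ->  rint(350.72) = 351
 * which dequantizes back to 351 / 256 = 1.37109375.
 * For a negative fractional length, fl = -2 and in = 100.0f:
 *   data = (int32_t)vsi_rint( 100.0 * (1.0f / (double)(1 << 2)) )  ->  rint(25.0) = 25
 * The result is then clamped to [min_range, max_range] of the target integer type. */
```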
@@ -468,6 +468,16 @@ FILE* vsi_nn_fopen
    const char * mode
    );

int32_t vsi_nn_get_vx_pad_mode
    (
    vsi_nn_pad_mode_e mode
    );

vsi_bool vsi_nn_is_3d_tensor
    (
    vsi_nn_tensor_t * tensor
    );

#ifdef __cplusplus
}
#endif
@@ -27,6 +27,8 @@
#include <memory>
#include <functional>

#include "vsi_nn_pub.h"

struct _vsi_nn_graph;
typedef struct _vsi_nn_graph vsi_nn_graph_t;

@@ -38,13 +40,13 @@ using data_t = const void*;

class IDevice {
   public:
    IDevice(uint32_t id);
    ~IDevice();
    uint32_t Id() const;
    bool GraphSubmit(vsi_nn_graph_t* graph, func_t func, data_t data);
    bool GraphRemove(const vsi_nn_graph_t* graph);
    bool ThreadExit();
    void WaitThreadIdle();
    OVXLIB_API IDevice(uint32_t id);
    OVXLIB_API ~IDevice();
    OVXLIB_API uint32_t Id() const;
    OVXLIB_API bool GraphSubmit(vsi_nn_graph_t* graph, func_t func, data_t data);
    OVXLIB_API bool GraphRemove(const vsi_nn_graph_t* graph);
    OVXLIB_API bool ThreadExit();
    OVXLIB_API void WaitThreadIdle();

   protected:
    Device* device_;

@@ -52,4 +54,4 @@ class IDevice {

}  // namespace vip

#endif
#endif
@@ -76,6 +76,7 @@ typedef struct _vsi_nn_runtime_option_t
    int32_t enable_opcheck;
    int32_t enable_concat_optimize;
    int32_t enable_asymi8_to_u8;
    int32_t enable_dataconvert_optimize;
} vsi_nn_runtime_option_t;

/**
@@ -751,6 +751,20 @@ OVXLIB_API vsi_bool vsi_nn_IsGraphFastMode
    (
    const vsi_nn_graph_t* graph
    );

OVXLIB_API vsi_status vsi_nn_CopyTensorViaGraphs
    (
    vsi_nn_graph_t *src_graph,
    vsi_nn_tensor_id_t src_tensor_id,
    vsi_nn_graph_t *dst_graph,
    vsi_nn_tensor_id_t dst_tensor_id
    );

OVXLIB_API vsi_status vsi_nn_ExecuteGraphLoop
    (
    vsi_nn_graph_t* graph,
    vsi_nn_tensor_t *max_iteration_tensor
    );
#ifdef __cplusplus
}
#endif
@@ -196,6 +196,10 @@
#include "ops/vsi_nn_op_softsign.h"
#include "ops/vsi_nn_op_cumsum.h"
#include "ops/vsi_nn_op_mod.h"
#include "ops/vsi_nn_op_lppool.h"
#include "ops/vsi_nn_op_scatter_elements.h"
#include "ops/vsi_nn_op_pre_process_yuv422.h"
#include "ops/vsi_nn_op_bucketize.h"
/* custom node head define define */
#include "custom/vsi_nn_custom_node_type.h"

@@ -206,9 +210,10 @@ extern "C"{
/** Operation attributes */
typedef union _vsi_nn_nn_param
{
    vsi_nn_conv2d_param conv2d;
    struct
    {
        vsi_nn_conv2d_param conv2d;
        vsi_nn_conv2d_param_deprecate conv2d_deprecate;
        vsi_nn_pool_param pool;
    };
    vsi_nn_fcl_param fcl;

@@ -377,6 +382,10 @@ typedef union _vsi_nn_nn_param
    vsi_nn_softsign_param softsign;
    vsi_nn_cumsum_param cumsum;
    vsi_nn_mod_param mod;
    vsi_nn_lppool_param lppool;
    vsi_nn_scatter_elements_param scatter_elements;
    vsi_nn_pre_process_yuv422_param pre_process_yuv422;
    vsi_nn_bucketize_param bucketize;
    void* client_param;

    /* custom node data struct define */
@@ -85,6 +85,8 @@ typedef enum
    VSI_NN_SOURCE_FORMAT_IMAGE_YUV444,
    VSI_NN_SOURCE_FORMAT_IMAGE_NV12,
    VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP,
    VSI_NN_SOURCE_FORMAT_IMAGE_YUYV422,
    VSI_NN_SOURCE_FORMAT_IMAGE_UYVY422,
} vsi_nn_preprocess_source_format_e;

/**
@@ -111,6 +111,22 @@ typedef enum
    VSI_NN_PAD_SAME
} vsi_nn_pad_e;

/** reduce type enum */
typedef enum
{
    VSI_NN_REDUCTION_TYPE_NONE,
    VSI_NN_REDUCTION_TYPE_ADD,
    VSI_NN_REDUCTION_TYPE_MUL
} vsi_nn_reduction_type_e;

/** Pad mode enum */
typedef enum {
    VSI_NN_PAD_MODE_CONSTANT,
    VSI_NN_PAD_MODE_REPLICATE,
    VSI_NN_PAD_MODE_SYMMETRIC,
    VSI_NN_PAD_MODE_REFLECT,
} vsi_nn_pad_mode_e;

/**
 * @deprecated Platform enum
 * @see vsi_nn_dim_fmt_e

@@ -235,6 +251,12 @@ typedef enum _vsi_nn_con2d_lstm_dataformat
    CONV2D_LSTM_CHANNELS_FIRST
} vsi_nn_con2d_lstm_dataformat;

typedef enum _vsi_nn_yuv_type
{
    VSI_NN_YUV_TYPE_YUYV422,
    VSI_NN_YUV_TYPE_UYUV422
} vsi_nn_yuv_type;

/** Deprecated */
typedef uint32_t vsi_nn_size_t;
@@ -33,7 +33,7 @@ extern "C"{

#define VSI_NN_VERSION_MAJOR 1
#define VSI_NN_VERSION_MINOR 1
#define VSI_NN_VERSION_PATCH 50
#define VSI_NN_VERSION_PATCH 57
#define VSI_NN_VERSION \
    (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
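The packed version value therefore moves from 1 * 10000 + 1 * 100 + 50 = 10150 to 10157 with this bump. A small illustrative compile-time check (not part of the commit):

```c
/* Sanity check of the encoded ovxlib version after this release. */
#if VSI_NN_VERSION != 10157
#error "unexpected ovxlib version encoding"
#endif
```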
@@ -56,7 +56,7 @@ __BEGIN_DECLS
        VSI_NN_GEN_BATCH_NORM_KERNEL_SOURCE_NAME },

#define HASH_BATCH_NORM_SH_KERNEL_2D_NAME( SRC_TYPE, DST_TYPE) \
    CVIVANTE_NAMESPACE("batch_norm_"#SRC_TYPE"to"#DST_TYPE"_2D")
    CVIVANTE_NAMESPACE("cl.batch_norm_"#SRC_TYPE"to"#DST_TYPE"_2D")

#define TENSOR_BATCH_NORM_KERNELS_2D( SRC_TYPE, OUT_TYPE) \
    { HASH_BATCH_NORM_KEY( SRC_TYPE, OUT_TYPE, 1), \
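The only change in this hunk is the added "cl." prefix, so the 2D batch-norm OpenCL kernel now resolves under the same CL namespace as the other CL kernels. Assuming CVIVANTE_NAMESPACE prepends the usual "com.vivantecorp.extension." prefix (that macro is not shown in this diff), the expansion would look like:

```c
/* Illustrative expansion under the stated assumption about CVIVANTE_NAMESPACE. */
HASH_BATCH_NORM_SH_KERNEL_2D_NAME( F16, F16 )
/* -> "com.vivantecorp.extension.cl.batch_norm_F16toF16_2D" */
```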
@ -0,0 +1,303 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
/*
|
||||
* Define kernel meta.
|
||||
*/
|
||||
typedef enum
|
||||
{
|
||||
INTERNAL_KERNEL_BUCKETIZE,
|
||||
} _internal_kernel_e;
|
||||
|
||||
#define STR(a) #a
|
||||
|
||||
// Add kernel hashtable here
|
||||
#define BUCKETIZE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ) \
|
||||
(( IN0_DTYPE ) | ( IN1_DTYPE << 8 ) | ( OUT_DTYPE << 16 ) | (RIGHT << 24) | (IMG_2D << 25))
|
||||
|
||||
#define PACK_KERNEL_3D_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ) \
|
||||
{ BUCKETIZE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ), \
|
||||
CVIVANTE_NAMESPACE("cl.bucketize_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \
|
||||
"bucketize" }
|
||||
#define PACK_KERNEL_2D_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ) \
|
||||
{ BUCKETIZE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ), \
|
||||
CVIVANTE_NAMESPACE("cl.bucketize_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \
|
||||
"bucketize" }
|
||||
#define PACK_KERNEL_RIGHT_3D_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ) \
|
||||
{ BUCKETIZE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ), \
|
||||
CVIVANTE_NAMESPACE("cl.bucketize_right_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \
|
||||
"bucketize" }
|
||||
#define PACK_KERNEL_RIGHT_2D_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ) \
|
||||
{ BUCKETIZE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ), \
|
||||
CVIVANTE_NAMESPACE("cl.bucketize_right_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \
|
||||
"bucketize" }
|
||||
|
||||
#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
|
||||
PACK_KERNEL_3D_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0, 0 ), \
|
||||
PACK_KERNEL_2D_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0, 1 ), \
|
||||
PACK_KERNEL_RIGHT_3D_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1, 0 ), \
|
||||
PACK_KERNEL_RIGHT_2D_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1, 1 ),
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint32_t key;
|
||||
char * function_name;
|
||||
const char * source_name;
|
||||
} _kernel_map_type;
|
||||
|
||||
static const _kernel_map_type _bucketize_kernel_map[] =
|
||||
{
|
||||
// Register kernel here
|
||||
PACK_KERNEL_MAP( F32, F32, I32 )
|
||||
PACK_KERNEL_MAP( I32, I32, I32 )
|
||||
PACK_KERNEL_MAP( U32, U32, I32 )
|
||||
PACK_KERNEL_MAP( BF16, BF16, I32 )
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _bucketize_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
};
|
||||
#define _BUCKETIZE_PARAM_NUM _cnt_of_array( _bucketize_kernel_param_def )
|
||||
#define SCALAR_BOUNDARIES_VALUE (3)
|
||||
#define SCALAR_SCALE0_VALUE (4)
|
||||
#define SCALAR_TAIL0_VALUE (5)
|
||||
#define SCALAR_SCALE1_VALUE (6)
|
||||
#define SCALAR_TAIL1_VALUE (7)
|
||||
|
||||
/*
|
||||
* Kernel initializer
|
||||
*/
|
||||
DEF_KERNEL_INITIALIZER(_bucketize_initializer)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
gpu_param_t gpu_param = {
|
||||
3,
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0}
|
||||
};
|
||||
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
|
||||
vsi_size_array_t * out_shape = NULL;
|
||||
|
||||
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
|
||||
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
|
||||
|
||||
out_shape = output_attr->shape;
|
||||
|
||||
gpu_param.global_scale[0] = 1;
|
||||
gpu_param.global_scale[1] = 1;
|
||||
gpu_param.global_scale[2] = 1;
|
||||
|
||||
gpu_param.dim = (out_shape->size < 3 || 1 == out_shape->data[2]) ? 2 : 3;
|
||||
gpu_param.global_size[0] = gpu_align_p2(
|
||||
(out_shape->data[0] + gpu_param.global_scale[0] - 1)
|
||||
/ gpu_param.global_scale[0], 4);
|
||||
gpu_param.global_size[1] = (
|
||||
(out_shape->data[1] + gpu_param.global_scale[1] - 1)
|
||||
/ gpu_param.global_scale[1]);
|
||||
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
|
||||
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
|
||||
|
||||
final:
|
||||
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
|
||||
SAFE_FREE_TENSOR_ATTR(output_attr);
|
||||
#undef SAFE_FREE_TENSOR_ATTR
|
||||
return status;
|
||||
} /* _bucketize_initializer() */
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs,
|
||||
int32_t right,
|
||||
vsi_bool is_img2d
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_dtype_e in0_dtype;
|
||||
vsi_nn_kernel_dtype_e in1_dtype;
|
||||
vsi_nn_kernel_dtype_e out_dtype;
|
||||
const _kernel_map_type * kernel_map = _bucketize_kernel_map;
|
||||
size_t kernel_map_size = _cnt_of_array( _bucketize_kernel_map );
|
||||
vx_param_description_t * param_def = _bucketize_kernel_param_def;
|
||||
vx_kernel_initialize_f initializer = _bucketize_initializer;
|
||||
|
||||
uint32_t key;
|
||||
uint32_t i;
|
||||
|
||||
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
|
||||
in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
|
||||
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
|
||||
|
||||
#define _PACK_SELECT_KEY( in0_dtype, in1_dtype ) \
|
||||
( ( in0_dtype ) | ( in1_dtype << 8 ))
|
||||
|
||||
switch (_PACK_SELECT_KEY(in0_dtype, in1_dtype))
|
||||
{
|
||||
case _PACK_SELECT_KEY(F32, F32):
|
||||
case _PACK_SELECT_KEY(F16, F16):
|
||||
key = BUCKETIZE_HASH_KEY( F32, F32, out_dtype, right, is_img2d );
|
||||
break;
|
||||
case _PACK_SELECT_KEY(I8, I8):
|
||||
case _PACK_SELECT_KEY(I16, I16):
|
||||
case _PACK_SELECT_KEY(I32, I32):
|
||||
key = BUCKETIZE_HASH_KEY( I32, I32, out_dtype, right, is_img2d );
|
||||
break;
|
||||
case _PACK_SELECT_KEY(U8, U8):
|
||||
case _PACK_SELECT_KEY(U16, U16):
|
||||
case _PACK_SELECT_KEY(U32, U32):
|
||||
key = BUCKETIZE_HASH_KEY( U32, U32, out_dtype, right, is_img2d );
|
||||
break;
|
||||
default:
|
||||
key = BUCKETIZE_HASH_KEY( in0_dtype, in1_dtype, out_dtype, right, is_img2d );
|
||||
break;
|
||||
}
|
||||
#undef _PACK_SELECT_KEY
|
||||
|
||||
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
|
||||
{
|
||||
if ( kernel_map[i].key == key )
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
if ( i < (uint32_t)kernel_map_size )
|
||||
{
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
|
||||
kernel->info.parameters = param_def;
|
||||
kernel->info.numParams = _cnt_of_array( _bucketize_kernel_param_def );
|
||||
kernel->info.initialize = initializer;
|
||||
// Register code source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
|
||||
kernel_map[i].source_name );
|
||||
// Register binary source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
|
||||
kernel_map[i].source_name );
|
||||
status = VSI_SUCCESS;
|
||||
}
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_BUCKETIZE_PARAM_NUM];
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
float input0_scale= vsi_nn_get_tensor_scale(inputs[0]);
|
||||
float input0_tail = -input0_scale * (float)vsi_nn_get_tensor_zero_point(inputs[0]);
|
||||
float input1_scale= vsi_nn_get_tensor_scale(inputs[1]);
|
||||
float input1_tail = -input1_scale * (float)vsi_nn_get_tensor_zero_point(inputs[1]);
|
||||
int32_t boundaries_size = (int32_t)inputs[1]->attr.size[0];
|
||||
vsi_bool image_2d = FALSE;
|
||||
int32_t right = vsi_nn_kernel_param_get_int32( params, "right" );
|
||||
|
||||
if( !vsi_nn_kernel_gpu_check_shape(
|
||||
inputs[0]->attr.size, inputs[0]->attr.dim_num ) ||
|
||||
boundaries_size >= GPU_TENSOR_MAX_WIDTH )
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1);
|
||||
|
||||
status = _query_kernel( kernel, inputs, outputs, right, image_2d );
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( node_params, _BUCKETIZE_PARAM_NUM,
|
||||
inputs, input_num, outputs, output_num );
|
||||
/* Pass parameters to node. */
|
||||
node_params[SCALAR_BOUNDARIES_VALUE] = vsi_nn_kernel_scalar_create( graph, I32, &boundaries_size );
|
||||
node_params[SCALAR_SCALE0_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &input0_scale );
|
||||
node_params[SCALAR_TAIL0_VALUE] = vsi_nn_kernel_scalar_create(graph, F32, &input0_tail );
|
||||
node_params[SCALAR_SCALE1_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &input1_scale );
|
||||
node_params[SCALAR_TAIL1_VALUE] = vsi_nn_kernel_scalar_create(graph, F32, &input1_tail );
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _BUCKETIZE_PARAM_NUM );
|
||||
vsi_nn_kernel_scalar_release( &node_params[SCALAR_BOUNDARIES_VALUE] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE0_VALUE] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[SCALAR_TAIL0_VALUE] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE1_VALUE] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[SCALAR_TAIL1_VALUE] );
|
||||
}
|
||||
}
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_CL( bucketize, _setup )
|
||||
|
||||
|
|
@@ -252,6 +252,16 @@ static vsi_status _query_kernel
    input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    if (input0_dtype == I8)
    {
        input0_dtype = I32;
    }

    if (output_dtype == I8)
    {
        output_dtype = I32;
    }

    key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, 0, is_batch );

    for ( i = 0; i < _cnt_of_array(gather_map); i ++ )
@ -0,0 +1,332 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
/*
|
||||
* Define kernel meta.
|
||||
*/
|
||||
typedef enum
|
||||
{
|
||||
INTERNAL_KERNEL_LPPOOL,
|
||||
} _internal_kernel_e;
|
||||
|
||||
#define _LPPOOL_KERNEL_SOURCE_NAME "lppool"
|
||||
|
||||
// Add kernel hashtable here
|
||||
#define LPPOOL_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
|
||||
(( IN_DTYPE << 8 ) | ( OUT_DTYPE ))
|
||||
#define LPPOOL_KERNELS( IN_DTYPE, OUT_DTYPE ) \
|
||||
{ LPPOOL_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \
|
||||
CVIVANTE_NAMESPACE("cl.lppool_"#IN_DTYPE"to"#OUT_DTYPE), \
|
||||
_LPPOOL_KERNEL_SOURCE_NAME }, \
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint32_t key;
|
||||
char * function_name;
|
||||
const char * source_name;
|
||||
} _kernel_map_type;
|
||||
|
||||
static const _kernel_map_type _lppool_kernel_map[] =
|
||||
{
|
||||
// Register kernel here
|
||||
LPPOOL_KERNELS( F32, F32 )
|
||||
LPPOOL_KERNELS( F32, U32 )
|
||||
LPPOOL_KERNELS( F32, I32 )
|
||||
LPPOOL_KERNELS( U32, U32 )
|
||||
LPPOOL_KERNELS( U32, F32 )
|
||||
LPPOOL_KERNELS( I32, I32 )
|
||||
LPPOOL_KERNELS( I32, F32 )
|
||||
LPPOOL_KERNELS( BF16, BF16 )
|
||||
};
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _lppool_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
};
|
||||
#define _LPPOOL_PARAM_NUM _cnt_of_array( _lppool_kernel_param_def )
|
||||
|
||||
/*
|
||||
* Kernel initializer
|
||||
*/
|
||||
DEF_KERNEL_INITIALIZER(_lppool_initializer)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
gpu_param_t gpu_param = {
|
||||
3,
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0}
|
||||
};
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vx_tensor output = (vx_tensor)param[1];
|
||||
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
|
||||
vsi_size_array_t *output_shape = NULL;
|
||||
|
||||
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output );
|
||||
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
|
||||
|
||||
output_shape = output_attr->shape;
|
||||
|
||||
gpu_param.global_scale[0] = 1;
|
||||
gpu_param.global_scale[1] = 1;
|
||||
gpu_param.global_scale[2] = 1;
|
||||
gpu_param.global_size[0] = (output_shape->data[0] + gpu_param.global_scale[0] - 1)
|
||||
/ gpu_param.global_scale[0];
|
||||
gpu_param.global_size[1] = (output_shape->data[1] + gpu_param.global_scale[1] - 1)
|
||||
/ gpu_param.global_scale[1];
|
||||
gpu_param.global_size[2] = (output_shape->data[2] + gpu_param.global_scale[2] - 1)
|
||||
/ gpu_param.global_scale[2];
|
||||
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
|
||||
|
||||
final:
|
||||
if (output_attr)
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release(&output_attr);
|
||||
}
|
||||
|
||||
return status;
|
||||
} /* _lppool_initializer() */
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_dtype_e in_dtype;
|
||||
vsi_nn_kernel_dtype_e out_dtype;
|
||||
const _kernel_map_type * kernel_map = _lppool_kernel_map;
|
||||
size_t kernel_map_size = _cnt_of_array( _lppool_kernel_map );
|
||||
vx_param_description_t * param_def = _lppool_kernel_param_def;
|
||||
vx_kernel_initialize_f initializer = _lppool_initializer;
|
||||
|
||||
uint32_t key;
|
||||
uint32_t i;
|
||||
|
||||
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
|
||||
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
|
||||
|
||||
#define _PACK_SELECT_KEY( in_dtype, out_dtype ) \
|
||||
(( in_dtype ) | (out_dtype << 8 ))
|
||||
switch (_PACK_SELECT_KEY(in_dtype, out_dtype))
|
||||
{
|
||||
case _PACK_SELECT_KEY(F32, F32):
|
||||
case _PACK_SELECT_KEY(F16, F16):
|
||||
case _PACK_SELECT_KEY(F32, F16):
|
||||
case _PACK_SELECT_KEY(F16, F32):
|
||||
key = LPPOOL_HASH_KEY( F32, F32);
|
||||
break;
|
||||
case _PACK_SELECT_KEY(F32, U8):
|
||||
case _PACK_SELECT_KEY(F16, U8):
|
||||
key = LPPOOL_HASH_KEY( F32, U32);
|
||||
break;
|
||||
case _PACK_SELECT_KEY(F32, I8):
|
||||
case _PACK_SELECT_KEY(F32, I16):
|
||||
case _PACK_SELECT_KEY(F16, I8):
|
||||
case _PACK_SELECT_KEY(F16, I16):
|
||||
key = LPPOOL_HASH_KEY( F32, I32);
|
||||
break;
|
||||
case _PACK_SELECT_KEY(U8, U8):
|
||||
key = LPPOOL_HASH_KEY( U32, U32);
|
||||
break;
|
||||
case _PACK_SELECT_KEY(U8, F16):
|
||||
case _PACK_SELECT_KEY(U8, F32):
|
||||
key = LPPOOL_HASH_KEY( U32, F32);
|
||||
break;
|
||||
case _PACK_SELECT_KEY(I8, I8):
|
||||
case _PACK_SELECT_KEY(I8, I16):
|
||||
case _PACK_SELECT_KEY(I16, I8):
|
||||
case _PACK_SELECT_KEY(I16, I16):
|
||||
key = LPPOOL_HASH_KEY( I32, I32);
|
||||
break;
|
||||
case _PACK_SELECT_KEY(I8, F16):
|
||||
case _PACK_SELECT_KEY(I8, F32):
|
||||
case _PACK_SELECT_KEY(I16, F16):
|
||||
case _PACK_SELECT_KEY(I16, F32):
|
||||
key = LPPOOL_HASH_KEY( I32, F32);
|
||||
break;
|
||||
default:
|
||||
key = LPPOOL_HASH_KEY( in_dtype, out_dtype);
|
||||
break;
|
||||
}
|
||||
#undef _PACK_SELECT_KEY
|
||||
|
||||
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
|
||||
{
|
||||
if ( kernel_map[i].key == key )
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
if ( i < (uint32_t)kernel_map_size )
|
||||
{
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
|
||||
kernel->info.parameters = param_def;
|
||||
kernel->info.numParams = _cnt_of_array( _lppool_kernel_param_def );
|
||||
kernel->info.initialize = initializer;
|
||||
// Register code source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
|
||||
"eltwise_ops_helper",
|
||||
kernel_map[i].source_name );
|
||||
// Register binary source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
|
||||
kernel_map[i].source_name );
|
||||
status = VSI_SUCCESS;
|
||||
}
|
||||
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_LPPOOL_PARAM_NUM];
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
int32_t ksize_x = vsi_nn_kernel_param_get_int32(params, "ksize_x");
|
||||
int32_t ksize_y = vsi_nn_kernel_param_get_int32(params, "ksize_y");
|
||||
int32_t stride_x = vsi_nn_kernel_param_get_int32(params, "stride_x");
|
||||
int32_t stride_y = vsi_nn_kernel_param_get_int32(params, "stride_y");
|
||||
int32_t pad_left = vsi_nn_kernel_param_get_int32(params, "pad_left");
|
||||
int32_t pad_top = vsi_nn_kernel_param_get_int32(params, "pad_top");
|
||||
int32_t p = vsi_nn_kernel_param_get_int32(params, "p");
|
||||
int32_t width = (int32_t)inputs[0]->attr.size[0];
|
||||
int32_t height = (int32_t)inputs[0]->attr.size[1];
|
||||
float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
|
||||
float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
|
||||
float inputScale = vsi_nn_get_tensor_scale(inputs[0]);
|
||||
float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
|
||||
|
||||
if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size,
|
||||
inputs[0]->attr.dim_num )
|
||||
|| !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
|
||||
outputs[0]->attr.dim_num ))
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
outputScale = 1.0f / outputScale;
|
||||
inputTail = -(inputTail * inputScale);
|
||||
|
||||
status = _query_kernel( kernel, inputs, outputs );
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
/* Set inputs and outputs */
|
||||
uint32_t index = 2;
|
||||
vsi_nn_kernel_node_pack_io( node_params, _LPPOOL_PARAM_NUM,
|
||||
inputs, input_num, outputs, output_num );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_x );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_y );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_x );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_y );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_left );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_top );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &p );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inputScale );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inputTail );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &outputTail );
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _LPPOOL_PARAM_NUM );
|
||||
vsi_nn_kernel_scalar_release( &node_params[2] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[3] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[4] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[5] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[6] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[7] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[8] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[9] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[10] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[11] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[12] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[13] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[14] );
|
||||
}
|
||||
}
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_CL( lppool, _setup )
|
||||
|
||||
|
|
@@ -250,7 +250,7 @@ static vsi_nn_kernel_node_t _setup
    float input1Scale = vsi_nn_get_tensor_scale(inputs[1]);
    float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1Scale;
    float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
    float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f;
    float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]);

    outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale;
@@ -250,7 +250,7 @@ static vsi_nn_kernel_node_t _setup
    float input1Scale = vsi_nn_get_tensor_scale(inputs[1]);
    float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1Scale;
    float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
    float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f;
    float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]);

    outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale;
@ -87,6 +87,7 @@ static vx_param_description_t _roi_align_kernel_param_def[] =
|
|||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
};
|
||||
#define _ROI_ALIGN_PARAM_NUM _cnt_of_array( _roi_align_kernel_param_def )
|
||||
|
||||
|
|
@ -103,8 +104,9 @@ static vx_param_description_t _roi_align_kernel_param_def[] =
|
|||
#define SCALAR_SAMPLING_X_RATIO (14)
|
||||
#define SCALAR_SAMPLING_Y_RATIO (15)
|
||||
#define SCALAR_DEPTH (16)
|
||||
#define SCALAR_FORMAT (17)
|
||||
|
||||
#define ROI_ALIGN_PARAM_NUM 17
|
||||
#define ROI_ALIGN_PARAM_NUM 18
|
||||
#define ROI_ALIGN_QUANT_PARAM_NUM _cnt_of_array( _roi_align_kernel_param_def )
|
||||
|
||||
/*
|
||||
|
|
@ -143,12 +145,8 @@ DEF_KERNEL_INITIALIZER(_roi_align_initializer)
|
|||
gpu_param.global_scale[2] = 1;
|
||||
|
||||
gpu_param.dim = 3;
|
||||
gpu_param.global_size[0] = gpu_align_p2(
|
||||
(out_shape->data[0] + gpu_param.global_scale[0] - 1)
|
||||
/ gpu_param.global_scale[0], 4);
|
||||
gpu_param.global_size[1] = (
|
||||
(out_shape->data[1] + gpu_param.global_scale[1] - 1)
|
||||
/ gpu_param.global_scale[1]);
|
||||
gpu_param.global_size[0] = out_shape->data[0];
|
||||
gpu_param.global_size[1] = out_shape->data[1];
|
||||
gpu_param.global_size[2] = rois_shape->data[1];
|
||||
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
|
||||
|
||||
|
|
@ -213,7 +211,8 @@ static vsi_status _query_kernel
|
|||
kernel->info.numParams = (uint32_t)param_def_size;
|
||||
kernel->info.initialize = initializer;
|
||||
// Register code source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
|
||||
"eltwise_ops_helper",
|
||||
kernel_map[i].source_name );
|
||||
// Register binary source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
|
||||
|
|
@ -259,8 +258,8 @@ static vsi_nn_kernel_node_t _setup
|
|||
float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
|
||||
float width_scale = roi_scale / width_ratio;
|
||||
float height_scale = roi_scale / height_ratio;
|
||||
float in_width = (float)(inputs[0]->attr.size[0]);
|
||||
float in_height = (float)(inputs[0]->attr.size[1]);
|
||||
int32_t in_width = (int32_t)(inputs[0]->attr.size[0]);
|
||||
int32_t in_height = (int32_t)(inputs[0]->attr.size[1]);
|
||||
float rcp_of_out_width = 1.0f / (float)(outputs[0]->attr.size[0]);
|
||||
float rcp_of_out_height = 1.0f / (float)(outputs[0]->attr.size[1]);
|
||||
float sampling_x_ratio = width_sample_num > 0 ? (float)width_sample_num : 0;
|
||||
|
|
@ -294,6 +293,8 @@ static vsi_nn_kernel_node_t _setup
|
|||
|
||||
if ( VSI_SUCCESS == status )
|
||||
{
|
||||
int32_t out_dtype = (int32_t)vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
|
||||
int32_t dtype = out_dtype == F16 ? 1 : out_dtype == F32 ? 2 : 0;
|
||||
size_t node_params_num = ROI_ALIGN_PARAM_NUM;
|
||||
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
|
|
@ -309,13 +310,14 @@ static vsi_nn_kernel_node_t _setup
|
|||
node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp );
|
||||
node_params[SCALAR_SPATIAL_X_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &width_scale );
|
||||
node_params[SCALAR_SPATIAL_Y_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &height_scale );
|
||||
node_params[SCALAR_INPUT_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &in_width );
|
||||
node_params[SCALAR_INPUT_HEIGHT] = vsi_nn_kernel_scalar_create( graph, F32, &in_height );
|
||||
node_params[SCALAR_INPUT_WIDTH] = vsi_nn_kernel_scalar_create( graph, I32, &in_width );
|
||||
node_params[SCALAR_INPUT_HEIGHT] = vsi_nn_kernel_scalar_create( graph, I32, &in_height );
|
||||
node_params[SCALAR_RCP_OF_OUTPUT_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &rcp_of_out_width );
|
||||
node_params[SCALAR_RCP_OF_OUTPUT_HEIGHT] = vsi_nn_kernel_scalar_create( graph, F32, &rcp_of_out_height );
|
||||
node_params[SCALAR_SAMPLING_X_RATIO] = vsi_nn_kernel_scalar_create( graph, F32, &sampling_x_ratio );
|
||||
node_params[SCALAR_SAMPLING_Y_RATIO] = vsi_nn_kernel_scalar_create( graph, F32, &sampling_y_ratio );
|
||||
node_params[SCALAR_DEPTH] = vsi_nn_kernel_scalar_create( graph, I32, &depth );
|
||||
node_params[SCALAR_FORMAT] = vsi_nn_kernel_scalar_create( graph, I32, &dtype );
|
||||
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
|
||||
|
|
@ -332,6 +334,8 @@ static vsi_nn_kernel_node_t _setup
|
|||
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SAMPLING_X_RATIO] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SAMPLING_Y_RATIO] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[SCALAR_DEPTH] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[SCALAR_DEPTH] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[SCALAR_FORMAT] );
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,351 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
/*
|
||||
* Define kernel meta.
|
||||
*/
|
||||
typedef enum
|
||||
{
|
||||
INTERNAL_KERNEL_SCATTER_ELEMENTS,
|
||||
} _internal_kernel_e;
|
||||
|
||||
#define _KERNEL_SOURCE0 "scatter_elements"
|
||||
#define _KERNEL_SOURCE1 "scatter_elements_add"
|
||||
#define _KERNEL_SOURCE2 "scatter_elements_mul"
|
||||
|
||||
#define STR(a) #a
|
||||
// Add kernel hashtable here
|
||||
#define SCATTER_ELEMENTS_HASH_KEY( IN0_DTYPE, IN2_DTYPE, OUT_DTYPE, AXIS, REDUCTION ) \
|
||||
(( IN0_DTYPE ) | ( IN2_DTYPE << 8 ) | ( OUT_DTYPE << 16 ) | ( AXIS << 24 ) | ( REDUCTION << 28 ))
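
The key above packs both input dtypes, the output dtype, the axis and the reduction mode into a single 32-bit integer so the kernel table can be searched with one compare. A minimal standalone sketch of the same packing; the numeric dtype codes in main are placeholders, not the real vsi_nn_kernel_dtype_e values:

#include <stdint.h>
#include <stdio.h>

/* Same field layout as SCATTER_ELEMENTS_HASH_KEY:
 * bits  0..7  : input0 (data) dtype
 * bits  8..15 : input2 (updates) dtype
 * bits 16..23 : output dtype
 * bits 24..27 : axis
 * bits 28..31 : reduction mode
 */
static uint32_t pack_key(uint32_t in0, uint32_t in2, uint32_t out,
                         uint32_t axis, uint32_t reduction)
{
    return in0 | (in2 << 8) | (out << 16) | (axis << 24) | (reduction << 28);
}

int main(void)
{
    /* placeholder dtype codes, illustrative only */
    uint32_t key = pack_key(3, 3, 3, 1, 2);
    printf("key = 0x%08x\n", (unsigned)key);
    return 0;
}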
|
||||
|
||||
#define PACK_KERNEL_NONE_MAP( IN0_DTYPE, IN2_DTYPE, OUT_DTYPE, AXIS ) \
|
||||
{ SCATTER_ELEMENTS_HASH_KEY( IN0_DTYPE, IN2_DTYPE, OUT_DTYPE, AXIS, VSI_NN_REDUCTION_TYPE_NONE ), \
|
||||
CVIVANTE_NAMESPACE("cl.scatter_elements_axis"STR(AXIS)"_"STR(IN0_DTYPE) \
|
||||
"_I32_"STR(IN2_DTYPE)"to"STR(OUT_DTYPE)), \
|
||||
_KERNEL_SOURCE0 }
|
||||
|
||||
#define PACK_KERNEL_ADD_MAP( IN0_DTYPE, IN2_DTYPE, OUT_DTYPE, AXIS ) \
|
||||
{ SCATTER_ELEMENTS_HASH_KEY( IN0_DTYPE, IN2_DTYPE, OUT_DTYPE, AXIS, VSI_NN_REDUCTION_TYPE_ADD ), \
|
||||
CVIVANTE_NAMESPACE("cl.scatter_elements_add_axis"STR(AXIS)"_"STR(IN0_DTYPE) \
|
||||
"_I32_"STR(IN2_DTYPE)"to"STR(OUT_DTYPE)), \
|
||||
_KERNEL_SOURCE1 }
|
||||
|
||||
#define PACK_KERNEL_MUL_MAP( IN0_DTYPE, IN2_DTYPE, OUT_DTYPE, AXIS ) \
|
||||
{ SCATTER_ELEMENTS_HASH_KEY( IN0_DTYPE, IN2_DTYPE, OUT_DTYPE, AXIS, VSI_NN_REDUCTION_TYPE_MUL ), \
|
||||
CVIVANTE_NAMESPACE("cl.scatter_elements_mul_axis"STR(AXIS)"_"STR(IN0_DTYPE) \
|
||||
"_I32_"STR(IN2_DTYPE)"to"STR(OUT_DTYPE)), \
|
||||
_KERNEL_SOURCE2 }
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint32_t key;
|
||||
char * function_name;
|
||||
const char * source_name;
|
||||
} _kernel_map_type;
|
||||
|
||||
|
||||
#define PACK_KERNELS_MAP(type) \
|
||||
PACK_KERNEL_NONE_MAP( type, type, type, 0 ), \
|
||||
PACK_KERNEL_NONE_MAP( type, type, type, 1 ), \
|
||||
PACK_KERNEL_ADD_MAP( type, type, type, 0 ), \
|
||||
PACK_KERNEL_ADD_MAP( type, type, type, 1 ), \
|
||||
PACK_KERNEL_MUL_MAP( type, type, type, 0 ), \
|
||||
PACK_KERNEL_MUL_MAP( type, type, type, 1 ), \
|
||||
PACK_KERNEL_MUL_MAP( type, type, type, 2 )
|
||||
|
||||
static const _kernel_map_type _scatter_elements_kernel_map[] =
|
||||
{
|
||||
// Register kernel here
|
||||
PACK_KERNELS_MAP( I8 ),
|
||||
PACK_KERNELS_MAP( U8 ),
|
||||
PACK_KERNELS_MAP( I16 ),
|
||||
PACK_KERNELS_MAP( F16 ),
|
||||
PACK_KERNELS_MAP( I32 ),
|
||||
PACK_KERNELS_MAP( F32 ),
|
||||
};
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _scatter_elements_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
};
|
||||
#define _SCATTER_ELEMENTS_PARAM_NUM _cnt_of_array( _scatter_elements_kernel_param_def )
|
||||
#define SCALAR_INPUT_AXIS (4)
|
||||
#define SCALAR_INPUT_REDUCTION (5)
|
||||
#define SCALAR_REF_SCALE (6)
|
||||
#define SCALAR_REF_TAIL (7)
|
||||
#define SCALAR_UPDATE_SCALE (8)
|
||||
#define SCALAR_UPDATE_TAIL (9)
|
||||
#define SCALAR_OUTPUT_ZP (10)
|
||||
#define SCALAR_INDICES_INNER (11)
|
||||
#define SCALAR_INDICES_AXIS (12)
|
||||
#define SCALAR_INDICES_OUTER (13)
|
||||
|
||||
/*
|
||||
* Kernel initializer
|
||||
*/
|
||||
DEF_KERNEL_INITIALIZER(_scatter_elements_initializer)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
gpu_param_t gpu_param = {
|
||||
3,
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0}
|
||||
};
|
||||
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
|
||||
vsi_size_array_t * out_shape = NULL;
|
||||
|
||||
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
|
||||
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
|
||||
|
||||
out_shape = output_attr->shape;
|
||||
|
||||
gpu_param.global_scale[0] = 1;
|
||||
gpu_param.global_scale[1] = 1;
|
||||
gpu_param.global_scale[2] = 1;
|
||||
|
||||
gpu_param.dim = (out_shape->size < 3 || 1 == out_shape->data[2]) ? 2 : 3;
|
||||
gpu_param.global_size[0] = out_shape->data[0];
|
||||
gpu_param.global_size[1] = out_shape->data[1];
|
||||
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
|
||||
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
|
||||
|
||||
final:
|
||||
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
|
||||
SAFE_FREE_TENSOR_ATTR(output_attr);
|
||||
return status;
|
||||
} /* _scatter_elements_initializer() */
|
||||
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs,
|
||||
int32_t axis,
|
||||
int32_t reduction
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_dtype_e in0_dtype;
|
||||
vsi_nn_kernel_dtype_e in1_dtype;
|
||||
vsi_nn_kernel_dtype_e in2_dtype;
|
||||
vsi_nn_kernel_dtype_e out_dtype;
|
||||
const _kernel_map_type * kernel_map = _scatter_elements_kernel_map;
|
||||
size_t kernel_map_size = _cnt_of_array( _scatter_elements_kernel_map );
|
||||
vx_param_description_t * param_def = _scatter_elements_kernel_param_def;
|
||||
vx_kernel_initialize_f initializer = _scatter_elements_initializer;
|
||||
|
||||
uint32_t key;
|
||||
uint32_t i;
|
||||
|
||||
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
|
||||
in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
|
||||
in2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
|
||||
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
|
||||
|
||||
if (in1_dtype != I32)
|
||||
{
|
||||
return VSI_FAILURE;
|
||||
}
|
||||
|
||||
key = SCATTER_ELEMENTS_HASH_KEY( in0_dtype, in2_dtype, out_dtype, axis, reduction );
|
||||
|
||||
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
|
||||
{
|
||||
if ( kernel_map[i].key == key )
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
if ( i < (uint32_t)kernel_map_size )
|
||||
{
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
|
||||
kernel->info.parameters = param_def;
|
||||
kernel->info.numParams = _cnt_of_array( _scatter_elements_kernel_param_def );
|
||||
kernel->info.initialize = initializer;
|
||||
// Register code source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
|
||||
"eltwise_ops_helper",
|
||||
kernel_map[i].source_name );
|
||||
// Register binary source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
|
||||
kernel_map[i].source_name );
|
||||
status = VSI_SUCCESS;
|
||||
}
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_SCATTER_ELEMENTS_PARAM_NUM];
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
vsi_nn_tensor_t* reshape_tensors[4] = { NULL };
|
||||
vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } };
|
||||
uint32_t rank_in = 0;
|
||||
int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis");
|
||||
int32_t reduction = vsi_nn_kernel_param_get_int32(params, "reduction");
|
||||
int32_t new_axis0 = 0;
|
||||
int32_t new_axis1 = 0;
|
||||
int32_t inner_size = 0;
|
||||
int32_t axis_size = 0;
|
||||
int32_t outer_size = 0;
|
||||
vsi_bool ret = FALSE;
|
||||
float output_scale = vsi_nn_get_tensor_scale(outputs[0]);
|
||||
float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
|
||||
float input0_scale = vsi_nn_get_tensor_scale(inputs[0]);
|
||||
float input0_tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
|
||||
float input2_scale = vsi_nn_get_tensor_scale(inputs[2]);
|
||||
float input2_tail = (float)vsi_nn_get_tensor_zero_point(inputs[2]);
|
||||
|
||||
#define MAX_SHAPE_SIZE (0xFFFFFFFF)
|
||||
ret = vsi_nn_kernel_optimize_scatter_elements_shape(
|
||||
inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
|
||||
shapes[0], &rank_in, &new_axis0, MAX_SHAPE_SIZE);
|
||||
ret &= vsi_nn_kernel_optimize_scatter_elements_shape(
|
||||
inputs[1]->attr.size, inputs[1]->attr.dim_num, axis,
|
||||
shapes[1], &rank_in, &new_axis1, MAX_SHAPE_SIZE);
|
||||
#undef MAX_SHAPE_SIZE
|
||||
|
||||
|
||||
if ( ret && new_axis0 == new_axis1 )
|
||||
{
|
||||
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
|
||||
inputs[0], shapes[0], rank_in );
|
||||
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
|
||||
inputs[1], shapes[1], rank_in );
|
||||
reshape_tensors[2] = vsi_nn_reshape_tensor( graph,
|
||||
inputs[2], shapes[1], rank_in );
|
||||
reshape_tensors[3] = vsi_nn_reshape_tensor( graph,
|
||||
outputs[0], shapes[0], rank_in );
|
||||
|
||||
inner_size = new_axis0 == 0 ? 1 : (int32_t)shapes[1][0];
|
||||
axis_size = new_axis0 == 0 ? (int32_t)shapes[1][0] : (int32_t)shapes[1][1];
|
||||
outer_size = new_axis0 == 0 ? (int32_t)shapes[1][1] : rank_in > 2 ? (int32_t)shapes[1][2] : 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
status = _query_kernel( kernel, inputs, outputs, axis, reduction );
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
input0_scale = input0_scale / output_scale;
|
||||
input0_tail = - input0_tail * input0_scale;
|
||||
input2_scale = input2_scale / output_scale;
|
||||
input2_tail = - input2_tail * input2_scale;
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( node_params, _SCATTER_ELEMENTS_PARAM_NUM,
|
||||
reshape_tensors, input_num, &reshape_tensors[3], output_num );
|
||||
/* Pass parameters to node. */
|
||||
node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create(graph, I32, &new_axis0 );
|
||||
node_params[SCALAR_INPUT_REDUCTION] = vsi_nn_kernel_scalar_create(graph, I32, &reduction );
|
||||
node_params[SCALAR_REF_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input0_scale );
|
||||
node_params[SCALAR_REF_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input0_tail );
|
||||
node_params[SCALAR_UPDATE_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input2_scale );
|
||||
node_params[SCALAR_UPDATE_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input2_tail );
|
||||
node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create(graph, F32, &output_zp );
|
||||
node_params[SCALAR_INDICES_INNER] = vsi_nn_kernel_scalar_create(graph, I32, &inner_size );
|
||||
node_params[SCALAR_INDICES_AXIS] = vsi_nn_kernel_scalar_create(graph, I32, &axis_size );
|
||||
node_params[SCALAR_INDICES_OUTER] = vsi_nn_kernel_scalar_create(graph, I32, &outer_size );
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _SCATTER_ELEMENTS_PARAM_NUM );
|
||||
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_AXIS] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_REDUCTION] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[SCALAR_REF_SCALE] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[SCALAR_REF_TAIL] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[SCALAR_UPDATE_SCALE] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[SCALAR_UPDATE_TAIL] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] );
|
||||
}
|
||||
}
|
||||
|
||||
vsi_safe_release_tensor( reshape_tensors[0] );
|
||||
vsi_safe_release_tensor( reshape_tensors[1] );
|
||||
vsi_safe_release_tensor( reshape_tensors[2] );
|
||||
vsi_safe_release_tensor( reshape_tensors[3] );
|
||||
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_CL( scatter_elements, _setup )
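
For reference, the operation registered here mirrors the ONNX ScatterElements definition: the output starts as a copy of the data tensor and each update element is written to the position named by the matching index along the chosen axis, optionally combined with add or mul. A minimal CPU sketch of the none-reduction case on fixed 2-D shapes; sizes and values are illustrative only:

#include <stdio.h>

/* out starts as a copy of data; indices and updates share one shape.
 * axis == 0 here: the row index is replaced by indices[i][j]. */
static void scatter_elements_axis0(const float data[3][3],
                                   const int   indices[2][3],
                                   const float updates[2][3],
                                   float       out[3][3])
{
    int i, j;
    for (i = 0; i < 3; i++)
        for (j = 0; j < 3; j++)
            out[i][j] = data[i][j];
    for (i = 0; i < 2; i++)
        for (j = 0; j < 3; j++)
            out[indices[i][j]][j] = updates[i][j];
}

int main(void)
{
    const float data[3][3]    = {{0,0,0},{0,0,0},{0,0,0}};
    const int   indices[2][3] = {{1,0,2},{0,2,1}};
    const float updates[2][3] = {{1.0f,1.1f,1.2f},{2.0f,2.1f,2.2f}};
    float out[3][3];
    int i;
    scatter_elements_axis0(data, indices, updates, out);
    for (i = 0; i < 3; i++)
        printf("%g %g %g\n", out[i][0], out[i][1], out[i][2]);
    return 0;
}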
@@ -0,0 +1,229 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
/*
|
||||
* Define kernel meta.
|
||||
*/
|
||||
#define _INPUT_NUM (2)
|
||||
#define _OUTPUT_NUM (1)
|
||||
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.bucketize")
|
||||
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _bucketize_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
};
|
||||
#define _BUCKETIZE_PARAM_NUM _cnt_of_array( _bucketize_kernel_param_def )
|
||||
#define SCALAR_RIGHT_VALUE (3)
|
||||
|
||||
/*
|
||||
* Kernel function
|
||||
*/
|
||||
DEF_KERNEL_EXECUTOR(_compute)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
|
||||
float *f32_in_buffer[_INPUT_NUM] = {NULL};
|
||||
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
|
||||
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
|
||||
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
|
||||
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
|
||||
uint32_t i = 0, j = 0;
|
||||
int32_t right = 0;
|
||||
uint32_t boundaries_size = 0;
|
||||
|
||||
/* prepare data */
|
||||
for(i = 0; i < _INPUT_NUM; i ++)
|
||||
{
|
||||
input[i] = (vsi_nn_kernel_tensor_t)param[i];
|
||||
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
|
||||
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
|
||||
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
|
||||
}
|
||||
for(i = 0; i < _OUTPUT_NUM; i ++)
|
||||
{
|
||||
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
|
||||
out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
|
||||
vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] );
|
||||
out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
|
||||
out_bytes[i] = out_elements[i] * sizeof(float);
|
||||
f32_out_buffer[i] = (float *)malloc( out_bytes[i] );
|
||||
CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
|
||||
memset( f32_out_buffer[i], 0, out_bytes[i] );
|
||||
}
|
||||
|
||||
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_RIGHT_VALUE], &(right));
|
||||
|
||||
boundaries_size = (uint32_t)in_attr[1]->shape->data[0];
|
||||
|
||||
for (i = 0; i < out_elements[0]; i++)
|
||||
{
|
||||
float src0 = f32_in_buffer[0][i];
|
||||
float dst = 0;
|
||||
|
||||
for (j = 0; j < boundaries_size; j++)
|
||||
{
|
||||
float src1 = f32_in_buffer[1][j];
|
||||
|
||||
if (right == 1)
|
||||
{
|
||||
dst += (src0 >= src1 ? 1.0f : 0.0f);
|
||||
}
|
||||
else
|
||||
{
|
||||
dst += (src0 > src1 ? 1.0f : 0.0f);
|
||||
}
|
||||
}
|
||||
|
||||
f32_out_buffer[0][i] = dst;
|
||||
}
|
||||
|
||||
/* save data */
|
||||
for(i = 0; i < _OUTPUT_NUM; i++)
|
||||
{
|
||||
status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i],
|
||||
f32_out_buffer[i], out_elements[i] );
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
}
|
||||
|
||||
final:
|
||||
for (i = 0; i < _INPUT_NUM; i++)
|
||||
{
|
||||
if (f32_in_buffer[i])
|
||||
{
|
||||
free(f32_in_buffer[i]);
|
||||
f32_in_buffer[i] = NULL;
|
||||
}
|
||||
if (in_attr[i])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
|
||||
}
|
||||
}
|
||||
for(i = 0; i < _OUTPUT_NUM; i++)
|
||||
{
|
||||
if (f32_out_buffer[i])
|
||||
{
|
||||
free(f32_out_buffer[i]);
|
||||
f32_out_buffer[i] = NULL;
|
||||
}
|
||||
if (out_attr[i])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
|
||||
}
|
||||
}
|
||||
|
||||
return status;
|
||||
} /* _compute() */
|
||||
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs
|
||||
/* Add extra params */
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
|
||||
kernel->info.function = _compute;
|
||||
kernel->info.parameters = _bucketize_kernel_param_def;
|
||||
kernel->info.numParams = _cnt_of_array( _bucketize_kernel_param_def );
|
||||
status = VSI_SUCCESS;
|
||||
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_BUCKETIZE_PARAM_NUM];
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
int32_t right = vsi_nn_kernel_param_get_int32( params, "right" );
|
||||
|
||||
status = _query_kernel( kernel, inputs, outputs /* Add extra params */ );
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( node_params, _BUCKETIZE_PARAM_NUM,
|
||||
inputs, input_num, outputs, output_num );
|
||||
/* Pass parameters to node. */
|
||||
node_params[SCALAR_RIGHT_VALUE] = vsi_nn_kernel_scalar_create( graph, I32, &right );
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _BUCKETIZE_PARAM_NUM );
|
||||
vsi_nn_kernel_scalar_release( &node_params[SCALAR_RIGHT_VALUE] );
|
||||
}
|
||||
}
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_CPU( bucketize, _setup )
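
In plain terms, the reference kernel above returns, for every input element, how many boundary values it has passed, which is the bucket index; the right flag picks the comparison used for ties, matching the >= / > split in _compute. A standalone sketch with made-up sample data:

#include <stdio.h>

/* Count how many boundaries each value has passed.
 * right != 0 : value >= boundary counts (an equal value falls in the right bucket)
 * right == 0 : value >  boundary counts (an equal value stays in the left bucket) */
static int bucket_index(float v, const float *boundaries, int n, int right)
{
    int j, idx = 0;
    for (j = 0; j < n; j++)
        idx += right ? (v >= boundaries[j]) : (v > boundaries[j]);
    return idx;
}

int main(void)
{
    const float boundaries[5] = {1.0f, 3.0f, 5.0f, 7.0f, 9.0f}; /* illustrative */
    const float values[3]     = {3.0f, 6.0f, 9.0f};
    int i;
    for (i = 0; i < 3; i++)
        printf("%g -> %d (right=0), %d (right=1)\n", values[i],
               bucket_index(values[i], boundaries, 5, 0),
               bucket_index(values[i], boundaries, 5, 1));
    return 0;
}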
@@ -0,0 +1,264 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
#include <math.h>   /* pow/fabs used by the Lp pooling loop */
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
/*
|
||||
* Define kernel meta.
|
||||
*/
|
||||
#define _INPUT_NUM (1)
|
||||
#define _OUTPUT_NUM (1)
|
||||
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.lppool")
|
||||
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _lppool_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
// Add kererl parameters here
|
||||
};
|
||||
#define _LPPOOL_PARAM_NUM _cnt_of_array( _lppool_kernel_param_def )
|
||||
|
||||
|
||||
/*
|
||||
* Kernel function
|
||||
*/
|
||||
DEF_KERNEL_EXECUTOR(_lppool_exec)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
|
||||
float * buffer[_INPUT_NUM + _OUTPUT_NUM] = { NULL };
|
||||
size_t out_elements = 0;
|
||||
vsi_nn_kernel_tensor_attr_t * attr[_INPUT_NUM + _OUTPUT_NUM] = { NULL };
|
||||
int32_t ksize_x = 0, ksize_y = 0, stride_x = 0, stride_y = 0;
|
||||
int32_t pad_left = 0, pad_right = 0, pad_top = 0, pad_bottom = 0;
|
||||
int32_t p = 0;
|
||||
int32_t i = 0;
|
||||
input[0] = (vsi_nn_kernel_tensor_t)param[0];
|
||||
output[0] = (vsi_nn_kernel_tensor_t)param[1];
|
||||
attr[0] = vsi_nn_kernel_tensor_attr_create( input[0] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
|
||||
attr[1] = vsi_nn_kernel_tensor_attr_create( output[0] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
|
||||
out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] );
|
||||
|
||||
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &ksize_x);
|
||||
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &ksize_y);
|
||||
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &pad_left);
|
||||
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &pad_right);
|
||||
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &pad_top);
|
||||
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &pad_bottom);
|
||||
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &stride_x);
|
||||
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &stride_y);
|
||||
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &p);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
||||
buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( input[0], attr[0], TRUE );
|
||||
CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final );
|
||||
|
||||
buffer[1] = (float *)malloc( out_elements * sizeof(float) );
|
||||
CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final );
|
||||
memset( buffer[1], 0, out_elements * sizeof(float) );
|
||||
|
||||
{
|
||||
int32_t batch = (int32_t)attr[1]->shape->data[2];
|
||||
int32_t height_o = (int32_t)attr[1]->shape->data[1];
|
||||
int32_t width_o = (int32_t)attr[1]->shape->data[0];
|
||||
int32_t height = (int32_t)attr[0]->shape->data[1];
|
||||
int32_t width = (int32_t)attr[0]->shape->data[0];
|
||||
int32_t b = 0, j = 0;
|
||||
int32_t output_base = 0;
|
||||
int32_t input_base = 0;
|
||||
float data = 0;
|
||||
for (b = 0; b < batch; b++)
|
||||
{
|
||||
output_base = b * height_o * width_o;
|
||||
input_base = b * height * width;
|
||||
for (j = 0; j < height_o; j++)
|
||||
{
|
||||
for (i = 0; i < width_o; i++)
|
||||
{
|
||||
int32_t hstart = j * stride_y - pad_top;
|
||||
int32_t wstart = i * stride_x - pad_left;
|
||||
int32_t hend = vsi_nn_min(hstart + ksize_y, height);
|
||||
int32_t wend = vsi_nn_min(wstart + ksize_x, width);
|
||||
int32_t pool_index = output_base + j * width_o + i;
|
||||
int32_t h = 0, w = 0;
|
||||
float sum_of_pow = 0;
|
||||
float out_data = 0;
|
||||
hstart = vsi_nn_max(hstart, 0);
|
||||
wstart = vsi_nn_max(wstart, 0);
|
||||
|
||||
for (h = hstart; h < hend; ++ h)
|
||||
{
|
||||
for (w = wstart; w < wend; ++ w)
|
||||
{
|
||||
int32_t index = input_base + h * width + w;
|
||||
data = buffer[0][index];
|
||||
sum_of_pow += (float)pow(fabs(data),p);
|
||||
}
|
||||
}
|
||||
out_data = (float)pow(sum_of_pow, 1.0f / p);
|
||||
buffer[1][pool_index] = out_data;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
status = vsi_nn_kernel_tensor_write_from_float( output[0], attr[1],
|
||||
buffer[1], out_elements );
|
||||
final:
|
||||
for ( i = 0; i < _INPUT_NUM + _OUTPUT_NUM; i ++ )
|
||||
{
|
||||
vsi_nn_safe_free( buffer[i] );
|
||||
if (attr[i])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &attr[i] );
|
||||
}
|
||||
}
|
||||
|
||||
return status;
|
||||
} /* _lppool_exec() */
|
||||
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs
|
||||
/* Add extra params */
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
|
||||
kernel->info.function = _lppool_exec;
|
||||
kernel->info.parameters = _lppool_kernel_param_def;
|
||||
kernel->info.numParams = _cnt_of_array( _lppool_kernel_param_def );
|
||||
status = VSI_SUCCESS;
|
||||
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_LPPOOL_PARAM_NUM];
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
|
||||
int32_t ksize_x = vsi_nn_kernel_param_get_int32(params, "ksize_x");
|
||||
int32_t ksize_y = vsi_nn_kernel_param_get_int32(params, "ksize_y");
|
||||
int32_t stride_x = vsi_nn_kernel_param_get_int32(params, "stride_x");
|
||||
int32_t stride_y = vsi_nn_kernel_param_get_int32(params, "stride_y");
|
||||
int32_t pad_left = vsi_nn_kernel_param_get_int32(params, "pad_left");
|
||||
int32_t pad_right = vsi_nn_kernel_param_get_int32(params, "pad_right");
|
||||
int32_t pad_top = vsi_nn_kernel_param_get_int32(params, "pad_top");
|
||||
int32_t pad_bottom = vsi_nn_kernel_param_get_int32(params, "pad_bottom");
|
||||
int32_t p = vsi_nn_kernel_param_get_int32(params, "p");
|
||||
|
||||
status = _query_kernel( kernel, inputs, outputs );
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
int32_t index = 2;
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( node_params, _LPPOOL_PARAM_NUM,
|
||||
inputs, input_num, outputs, output_num );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_x );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_y );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_left );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_right );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_top );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_bottom );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_x );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_y );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &p );
|
||||
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _LPPOOL_PARAM_NUM );
|
||||
VSI_ASSERT( status == VSI_SUCCESS );
|
||||
vsi_nn_kernel_scalar_release( &node_params[2] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[3] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[4] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[5] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[6] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[7] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[8] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[9] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[10] );
|
||||
}
|
||||
}
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_CPU( lppool, _setup )
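
The value written per output position above is the p-norm of the pooling window, out = (sum over the window of |x|^p)^(1/p), with the window clamped to the feature map instead of padding. A minimal single-window sketch of that formula; the window contents are illustrative:

#include <math.h>
#include <stdio.h>

/* Lp pooling over one window: (sum |x|^p)^(1/p). */
static float lp_pool_window(const float *window, int count, int p)
{
    float sum_of_pow = 0.0f;
    int i;
    for (i = 0; i < count; i++)
        sum_of_pow += (float)pow(fabs(window[i]), p);
    return (float)pow(sum_of_pow, 1.0f / (float)p);
}

int main(void)
{
    const float window[4] = {1.0f, -2.0f, 2.0f, 0.0f}; /* a made-up 2x2 window */
    /* p = 2 gives the Euclidean norm: sqrt(1 + 4 + 4 + 0) = 3 */
    printf("L2 pool = %g\n", lp_pool_window(window, 4, 2));
    return 0;
}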
@@ -56,14 +56,26 @@ static vsi_ssize_t _expand_offset
|
|||
vsi_size_t i;
|
||||
vsi_ssize_t offset = 0;
|
||||
|
||||
for( i = 0; i < rank && index; i ++ )
|
||||
for ( i = 0; i < rank && index; i ++ )
|
||||
{
|
||||
if( shape[i] == out_shape[i] )
|
||||
if (strides[0] == 0)
|
||||
{
|
||||
if (i == 0)
|
||||
{
|
||||
offset += (index % out_shape[0]);
|
||||
}
|
||||
else
|
||||
{
|
||||
offset += (vsi_ssize_t)strides[i] * 2 * ( index % out_shape[i] );
|
||||
}
|
||||
}
|
||||
else if ( shape[i] == out_shape[i] )
|
||||
{
|
||||
offset += (vsi_ssize_t)strides[i] * ( index % out_shape[i] );
|
||||
}
|
||||
index /= out_shape[i];
|
||||
}
|
||||
|
||||
return offset;
|
||||
}
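
Both copies of _expand_offset translate a flattened output index into an input element offset under broadcasting: a dimension advances the offset by its stride only when the input and output extents match; the added strides[0] == 0 branch instead uses the raw element index for dimension 0 and doubles the later strides. A sketch of the basic mapping without that special case; the shapes, strides and fastest-dimension-first layout are illustrative assumptions:

#include <stdio.h>

/* Map a flattened output index to an input offset under broadcasting:
 * dimensions where in_shape[i] == out_shape[i] move by strides[i],
 * broadcast dimensions (in_shape[i] == 1) contribute nothing. */
static long expand_offset(long index, const long *in_shape, const long *strides,
                          const long *out_shape, int rank)
{
    long offset = 0;
    int i;
    for (i = 0; i < rank && index; i++)
    {
        if (in_shape[i] == out_shape[i])
            offset += strides[i] * (index % out_shape[i]);
        index /= out_shape[i];
    }
    return offset;
}

int main(void)
{
    /* input {3, 1} broadcast against output {3, 4}; strides in elements */
    const long in_shape[2]  = {3, 1};
    const long out_shape[2] = {3, 4};
    const long strides[2]   = {1, 3};
    long idx;
    for (idx = 0; idx < 12; idx++)
        printf("out %ld -> in %ld\n", idx,
               expand_offset(idx, in_shape, strides, out_shape, 2));
    return 0;
}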
@@ -52,14 +52,26 @@ static vsi_ssize_t _expand_offset
|
|||
vsi_size_t i;
|
||||
vsi_ssize_t offset = 0;
|
||||
|
||||
for( i = 0; i < rank && index; i ++ )
|
||||
for ( i = 0; i < rank && index; i ++ )
|
||||
{
|
||||
if( shape[i] == out_shape[i] )
|
||||
if (strides[0] == 0)
|
||||
{
|
||||
if (i == 0)
|
||||
{
|
||||
offset += (index % out_shape[0]);
|
||||
}
|
||||
else
|
||||
{
|
||||
offset += (vsi_ssize_t)strides[i] * 2 * ( index % out_shape[i] );
|
||||
}
|
||||
}
|
||||
else if ( shape[i] == out_shape[i] )
|
||||
{
|
||||
offset += (vsi_ssize_t)strides[i] * ( index % out_shape[i] );
|
||||
}
|
||||
index /= out_shape[i];
|
||||
}
|
||||
|
||||
return offset;
|
||||
}
@@ -0,0 +1,405 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
/*
|
||||
* Define kernel meta.
|
||||
*/
|
||||
#define _CPU_ARG_NUM (11)
|
||||
#define _CPU_INPUT_NUM (1)
|
||||
#define _CPU_OUTPUT_NUM (1)
|
||||
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
|
||||
#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
|
||||
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.pre_process_yuv422_sw")
|
||||
|
||||
#define DESCALE(x) (((x) + (1<<19)) >> 20)
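
DESCALE rounds a Q20 fixed-point value to the nearest integer: add half of one integer step (1 << 19) before shifting right by 20. A quick check of the rounding it performs:

#include <stdio.h>

/* Round-to-nearest for a Q20 fixed-point value (same expression as above). */
#define DESCALE(x) (((x) + (1 << 19)) >> 20)

int main(void)
{
    int v = (int)(2.6 * (1 << 20));   /* 2.6 in Q20, illustrative value */
    printf("%d\n", DESCALE(v));       /* prints 3 */
    return 0;
}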
|
||||
|
||||
static vx_param_description_t kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
};
|
||||
|
||||
|
||||
DEF_KERNEL_EXECUTOR(_pre_process_yuv422_exec)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
|
||||
float * buffer[_CPU_IO_NUM] = { NULL };
|
||||
float * outBuffer = NULL;
|
||||
size_t out_elements = 0;
|
||||
vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
|
||||
uint32_t i = 0;
|
||||
int32_t xRatio = 0, yRatio = 0, xOffset = 0, yOffset = 0;
|
||||
float rMean = 0, gMean = 0, bMean = 0, var = 0;
|
||||
int32_t order = 0, trans = 0, yuv422_type = 0;
|
||||
|
||||
tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
|
||||
tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
|
||||
|
||||
attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
|
||||
attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
|
||||
|
||||
out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] );
|
||||
|
||||
i = 2;
|
||||
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xRatio);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yRatio);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xOffset);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yOffset);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &rMean);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &gMean);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &bMean);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &var);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &order);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &trans);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yuv422_type);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
||||
buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
|
||||
CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final );
|
||||
|
||||
buffer[1] = (float *)malloc( out_elements * sizeof(float) );
|
||||
CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final );
|
||||
memset( buffer[1], 0, out_elements * sizeof(float) );
|
||||
|
||||
if(trans)
|
||||
{
|
||||
outBuffer = (float *)malloc( out_elements * sizeof(float) );
|
||||
CHECK_PTR_FAIL_GOTO( outBuffer, "Create output buffer fail.", final );
|
||||
memset( outBuffer, 0, out_elements * sizeof(float) );
|
||||
}
|
||||
|
||||
{
|
||||
int32_t dx, dy, dz;
|
||||
int32_t src_width = (int32_t)attr[0]->shape->data[0];
|
||||
int32_t dst_width = (int32_t)(trans ? attr[1]->shape->data[1] : attr[1]->shape->data[0]);
|
||||
int32_t dst_height = (int32_t)(trans ? attr[1]->shape->data[2] : attr[1]->shape->data[1]);
|
||||
int32_t stride = (int32_t)(dst_width * dst_height);
|
||||
int32_t rOffset = 0;
|
||||
int32_t gOffset = 1 * stride;
|
||||
int32_t bOffset = 2 * stride;
|
||||
float D0, D1, E0, E1;
|
||||
float R0, G0, B0, R1, G1, B1;
|
||||
float min = 0;
|
||||
float max = 255;
|
||||
float* src_y_slice = NULL;
|
||||
|
||||
uint32_t roi_width = (xRatio * dst_width) >> 15;
|
||||
uint32_t roi_height = (yRatio * dst_height) >> 15;
|
||||
uint32_t xrIntFloat_16 = (roi_width << 16) / dst_width + 1;
|
||||
uint32_t yrIntFloat_16 = (roi_height << 16) / dst_height + 1;
|
||||
uint32_t srcy = 0, srcx = 0;
|
||||
|
||||
if(attr[1]->dtype == I8)
|
||||
{
|
||||
min = -128;
|
||||
max = 127;
|
||||
}
|
||||
else if(attr[1]->dtype == I16 || attr[1]->dtype == F16)
|
||||
{
|
||||
min = -32768;
|
||||
max = 32767;
|
||||
}
|
||||
|
||||
if(order)
|
||||
{
|
||||
rOffset = 2 * stride;
|
||||
bOffset = 0;
|
||||
}
|
||||
|
||||
for ( dz = 0; dz < 1; dz ++)
|
||||
{
|
||||
for ( dy = 0; dy < (int32_t)dst_height; dy++)
|
||||
{
|
||||
srcy = (((uint32_t)dy * yrIntFloat_16) >> 16) + yOffset;
|
||||
src_y_slice = buffer[0] + (srcy) * src_width;
|
||||
for ( dx = 0; dx < (int32_t)dst_width; dx += 2)
|
||||
{
|
||||
int32_t output_index = 0;
|
||||
int32_t dstR_idx = 0, dstG_idx = 0, dstB_idx = 0;
|
||||
float tmpY0 = 0.0f;
|
||||
float tmpY1 = 0.0f;
|
||||
float tmpU0 = 0.0f;
|
||||
float tmpU1 = 0.0f;
|
||||
float tmpV0 = 0.0f;
|
||||
float tmpV1 = 0.0f;
|
||||
|
||||
srcx = ((((uint32_t)dx * xrIntFloat_16) >> 16) + xOffset) * 2;
|
||||
|
||||
if (xrIntFloat_16 >> 16 == 1)
|
||||
{
|
||||
if (yuv422_type == 1)
|
||||
{
|
||||
tmpY0 = src_y_slice[srcx + 1];
|
||||
tmpU0 = src_y_slice[srcx];
|
||||
tmpY1 = src_y_slice[srcx + 3];
|
||||
tmpV0 = src_y_slice[srcx + 2];
|
||||
tmpU1 = tmpU0;
|
||||
tmpV1 = tmpV0;
|
||||
}
|
||||
else
|
||||
{
|
||||
tmpY0 = src_y_slice[srcx];
|
||||
tmpU0 = src_y_slice[srcx + 1];
|
||||
tmpY1 = src_y_slice[srcx + 2];
|
||||
tmpV0 = src_y_slice[srcx + 3];
|
||||
tmpU1 = tmpU0;
|
||||
tmpV1 = tmpV0;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (yuv422_type == 1)
|
||||
{
|
||||
tmpY0 = src_y_slice[srcx + 1];
|
||||
tmpU0 = src_y_slice[(srcx / 4) * 4];
|
||||
tmpV0 = src_y_slice[(srcx / 4) * 4 + 2];
|
||||
srcx = (((uint32_t)(dx + 1) * xrIntFloat_16) >> 16) + xOffset;
|
||||
srcx = srcx * 2;
|
||||
tmpY1 = src_y_slice[srcx + 1];
|
||||
tmpU1 = src_y_slice[(srcx / 4) * 4];
|
||||
tmpV1 = src_y_slice[(srcx / 4) * 4 + 2];
|
||||
}
|
||||
else
|
||||
{
|
||||
tmpY0 = src_y_slice[srcx];
|
||||
tmpU0 = src_y_slice[(srcx / 4) * 4 + 1];
|
||||
tmpV0 = src_y_slice[(srcx / 4) * 4 + 3];
|
||||
srcx = (((uint32_t)(dx + 1) * xrIntFloat_16) >> 16) + xOffset;
|
||||
srcx = srcx * 2;
|
||||
tmpY1 = src_y_slice[srcx];
|
||||
tmpU1 = src_y_slice[(srcx / 4) * 4 + 1];
|
||||
tmpV1 = src_y_slice[(srcx / 4) * 4 + 3];
|
||||
}
|
||||
}
|
||||
|
||||
D0 = (tmpU0 - 128);
|
||||
E0 = (tmpV0 - 128);
|
||||
D1 = (tmpU1 - 128);
|
||||
E1 = (tmpV1 - 128);
|
||||
|
||||
B0 = (float)vsi_clamp((tmpY0 + (1.7790 * D0)), min, max);
|
||||
G0 = (float)vsi_clamp((tmpY0 - 0.3455 * D0 - 0.7169 * E0), min, max);
|
||||
R0 = (float)vsi_clamp((tmpY0 + 1.4065 * E0), min, max);
|
||||
|
||||
B1 = (float)vsi_clamp((tmpY1 + (1.7790 * D1)), min, max);
|
||||
G1 = (float)vsi_clamp((tmpY1 - 0.3455 * D1 - 0.7169 * E1), min, max);
|
||||
R1 = (float)vsi_clamp((tmpY1 + 1.4065 * E1), min, max);
|
||||
|
||||
output_index = dx + dy * dst_width;
|
||||
|
||||
dstR_idx = output_index + rOffset;
|
||||
dstG_idx = output_index + gOffset;
|
||||
dstB_idx = output_index + bOffset;
|
||||
|
||||
buffer[1][dstB_idx] = (B0 - bMean) * var;
|
||||
buffer[1][dstG_idx] = (G0 - gMean) * var;
|
||||
buffer[1][dstR_idx] = (R0 - rMean) * var;
|
||||
|
||||
dstR_idx += 1;
|
||||
dstG_idx += 1;
|
||||
dstB_idx += 1;
|
||||
|
||||
buffer[1][dstB_idx] = (B1 - bMean) * var;
|
||||
buffer[1][dstG_idx] = (G1 - gMean) * var;
|
||||
buffer[1][dstR_idx] = (R1 - rMean) * var;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(trans)
|
||||
{
|
||||
vsi_size_t shape[] = {attr[1]->shape->data[0], attr[1]->shape->data[1], attr[1]->shape->data[2], 1};
|
||||
vsi_size_t perm[] = {1, 2, 0, 3};
|
||||
vsi_nn_Transpose((uint8_t*)outBuffer, (uint8_t*)buffer[1],
|
||||
shape, (uint32_t)attr[1]->shape->size, perm, VSI_NN_TYPE_FLOAT32);
|
||||
|
||||
status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1],
|
||||
outBuffer, out_elements );
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
}
|
||||
else
|
||||
{
|
||||
status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1],
|
||||
buffer[1], out_elements );
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
}
|
||||
|
||||
final:
|
||||
if(outBuffer)
|
||||
{
|
||||
free(outBuffer);
|
||||
}
|
||||
for( i = 0; i < _CPU_IO_NUM; i ++ )
|
||||
{
|
||||
if( buffer[i] )
|
||||
{
|
||||
free( buffer[i] );
|
||||
}
|
||||
if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); }
|
||||
}
|
||||
return status;
|
||||
} /* _pre_process_yuv422_exec() */
|
||||
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs
|
||||
/* Add extra params */
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
|
||||
kernel->info.function = _pre_process_yuv422_exec;
|
||||
kernel->info.parameters = kernel_param_def;
|
||||
kernel->info.numParams = _cnt_of_array( kernel_param_def );
|
||||
status = VSI_SUCCESS;
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_CPU_PARAM_NUM];
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
status = _query_kernel( kernel, inputs, outputs);
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
uint32_t index = 2;
|
||||
int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" );
|
||||
int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" );
|
||||
int32_t left = vsi_nn_kernel_param_get_int32( params, "left" );
|
||||
int32_t top = vsi_nn_kernel_param_get_int32( params, "top" );
|
||||
float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" );
|
||||
float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" );
|
||||
float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" );
|
||||
float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" );
|
||||
int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" );
|
||||
int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" );
|
||||
int32_t yuv422_type = vsi_nn_kernel_param_get_int32( params, "yuv422_type" );
|
||||
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( node_params, _CPU_PARAM_NUM,
|
||||
inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM );
|
||||
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &yuv422_type );
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _CPU_PARAM_NUM );
|
||||
CHECK_STATUS( status );
|
||||
vsi_nn_kernel_scalar_release( &node_params[2] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[3] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[4] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[5] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[6] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[7] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[8] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[9] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[10] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[11] );
|
||||
}
|
||||
else
|
||||
{
|
||||
status = VSI_FAILURE;
|
||||
}
|
||||
}
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_CPU( pre_process_yuv422, _setup )
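
The conversion above uses the full-range YUV to RGB equations with the coefficients hard-coded in the kernel, R = Y + 1.4065*(V-128), G = Y - 0.3455*(U-128) - 0.7169*(V-128), B = Y + 1.7790*(U-128), applied to the two pixels that share one U/V pair in a YUYV or UYVY group. A minimal sketch of the per-pixel math; the [0, 255] clamp assumes a U8 output:

#include <stdio.h>

static float clampf(float v, float lo, float hi)
{
    return v < lo ? lo : (v > hi ? hi : v);
}

/* Same coefficients as the kernel above, output clamped to [0, 255]. */
static void yuv_to_rgb(float y, float u, float v, float *r, float *g, float *b)
{
    float d = u - 128.0f;
    float e = v - 128.0f;
    *r = clampf(y + 1.4065f * e, 0.0f, 255.0f);
    *g = clampf(y - 0.3455f * d - 0.7169f * e, 0.0f, 255.0f);
    *b = clampf(y + 1.7790f * d, 0.0f, 255.0f);
}

int main(void)
{
    float r, g, b;
    yuv_to_rgb(128.0f, 128.0f, 128.0f, &r, &g, &b); /* mid grey stays grey */
    printf("%g %g %g\n", r, g, b);
    return 0;
}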
@@ -73,7 +73,7 @@ static float _compute_region_coordinate(int32_t p, float bin_size, float roi_anchor
|
|||
{
|
||||
const float region_start = p * bin_size + roi_anchor;
|
||||
|
||||
return vsi_nn_clamp(region_start, 0.0f, max_value - 1);
|
||||
return region_start;
|
||||
}
|
||||
|
||||
static float _roi_align_1x1(float *input_ptr,
|
||||
|
|
@@ -88,53 +88,64 @@ static float _roi_align_1x1(float *input_ptr,
|
|||
int32_t grid_size_y,
|
||||
float region_end_y)
|
||||
{
|
||||
if ((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
|
||||
float avg = 0;
|
||||
int32_t iy = 0;
|
||||
int32_t ix = 0;
|
||||
// Iterate through the aligned pooling region
|
||||
for (iy = 0; iy < grid_size_y; ++iy)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
float avg = 0;
|
||||
int32_t iy = 0;
|
||||
int32_t ix = 0;
|
||||
// Iterate through the aligned pooling region
|
||||
for (iy = 0; iy < grid_size_y; ++iy)
|
||||
for (ix = 0; ix < grid_size_x; ++ix)
|
||||
{
|
||||
for (ix = 0; ix < grid_size_x; ++ix)
|
||||
{
|
||||
// Align the window in the middle of every bin
|
||||
float y = region_start_y + ((float)iy + 0.5f) * bin_size_y / (float)(grid_size_y);
|
||||
float x = region_start_x + ((float)ix + 0.5f) * bin_size_x / (float)(grid_size_x);
|
||||
// Align the window in the middle of every bin
|
||||
float y = region_start_y +
|
||||
((float)iy + 0.5f) * bin_size_y / (float)(grid_size_y);
|
||||
float x = region_start_x +
|
||||
((float)ix + 0.5f) * bin_size_x / (float)(grid_size_x);
|
||||
|
||||
// Interpolation in the [0,0] [0,1] [1,0] [1,1] square
|
||||
const int32_t y_low = (int32_t)y;
|
||||
const int32_t x_low = (int32_t)x;
|
||||
const int32_t y_high = vsi_nn_min(y_low + 1, height - 1);
|
||||
const int32_t x_high = vsi_nn_min(x_low + 1, width - 1);
|
||||
// Interpolation in the [0,0] [0,1] [1,0] [1,1] square
|
||||
const int32_t y_low = vsi_nn_min((int32_t)y, height - 1);
|
||||
const int32_t x_low = vsi_nn_min((int32_t)x, width - 1);
|
||||
const int32_t y_high = vsi_nn_min(y_low + 1, height - 1);
|
||||
const int32_t x_high = vsi_nn_min(x_low + 1, width - 1);
|
||||
|
||||
const float ly = y - y_low;
|
||||
const float lx = x - x_low;
|
||||
const float hy = 1.0f - ly;
|
||||
const float hx = 1.0f - lx;
|
||||
float ly = y - y_low;
|
||||
float lx = x - x_low;
|
||||
float hy = 1.0f - ly;
|
||||
float hx = 1.0f - lx;
|
||||
|
||||
const float w1 = hy * hx;
|
||||
const float w2 = hy * lx;
|
||||
const float w3 = ly * hx;
|
||||
const float w4 = ly * lx;
|
||||
float w1 = hy * hx;
|
||||
float w2 = hy * lx;
|
||||
float w3 = ly * hx;
|
||||
float w4 = ly * lx;
|
||||
|
||||
const float data1 = *(input_ptr + y_low * width + x_low);
|
||||
const float data2 = *(input_ptr + y_low * width + x_high);
|
||||
const float data3 = *(input_ptr + y_high * width + x_low);
|
||||
const float data4 = *(input_ptr + y_high * width + x_high);
|
||||
const float data1 = *(input_ptr + y_low * width + x_low);
|
||||
const float data2 = *(input_ptr + y_low * width + x_high);
|
||||
const float data3 = *(input_ptr + y_high * width + x_low);
|
||||
const float data4 = *(input_ptr + y_high * width + x_high);
|
||||
|
||||
avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
|
||||
}
|
||||
/* onnx: inverse elements are out of feature map boundary */
|
||||
if (x > width || x < -1 || y > height || y < -1) continue;
|
||||
|
||||
x = x_low >= width - 1 ? x_low : x;
|
||||
y = y_low >= height - 1 ? y_low : y;
|
||||
|
||||
ly = y - y_low;
|
||||
lx = x - x_low;
|
||||
hy = 1.0f - ly;
|
||||
hx = 1.0f - lx;
|
||||
|
||||
w1 = hy * hx;
|
||||
w2 = hy * lx;
|
||||
w3 = ly * hx;
|
||||
w4 = ly * lx;
|
||||
|
||||
avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
|
||||
}
|
||||
|
||||
avg /= grid_size_x * grid_size_y;
|
||||
|
||||
return avg;
|
||||
}
|
||||
|
||||
avg /= grid_size_x * grid_size_y;
|
||||
|
||||
return avg;
|
||||
}
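
The loop body above is standard bilinear interpolation: each sampling point is split into its low and high integer neighbours and the four corner values are blended with weights w1..w4 that sum to one, while the added continue skips samples that land outside the feature map, as ONNX RoiAlign specifies. A standalone sketch of just the interpolation step; the 2x2 map is illustrative:

#include <stdio.h>

static int imin(int a, int b) { return a < b ? a : b; }

/* Bilinear sample of a row-major width x height map at fractional (x, y). */
static float bilinear(const float *map, int width, int height, float x, float y)
{
    int x_low  = (int)x;
    int y_low  = (int)y;
    int x_high = imin(x_low + 1, width - 1);
    int y_high = imin(y_low + 1, height - 1);
    float lx = x - x_low, ly = y - y_low;
    float hx = 1.0f - lx, hy = 1.0f - ly;
    return hy * hx * map[y_low  * width + x_low ] +
           hy * lx * map[y_low  * width + x_high] +
           ly * hx * map[y_high * width + x_low ] +
           ly * lx * map[y_high * width + x_high];
}

int main(void)
{
    const float map[4] = {0.0f, 1.0f,   /* 2x2 feature map */
                          2.0f, 3.0f};
    printf("%g\n", bilinear(map, 2, 2, 0.5f, 0.5f)); /* centre -> 1.5 */
    return 0;
}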
|
||||
|
||||
DEF_KERNEL_EXECUTOR(_compute)
@@ -0,0 +1,258 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
/*
|
||||
* Define kernel meta.
|
||||
*/
|
||||
#define _ARG_NUM (2)
|
||||
#define _INPUT_NUM (3)
|
||||
#define _OUTPUT_NUM (1)
|
||||
#define _CPU_IO_NUM (_INPUT_NUM + _OUTPUT_NUM)
|
||||
#define _CPU_PARAM_NUM (_ARG_NUM + _CPU_IO_NUM)
|
||||
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.gather_elements")
|
||||
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _scatter_elements_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
// Add kernel parameters here
|
||||
};
|
||||
#define _SCATTER_ELEMENTS_PARAM_NUM _cnt_of_array( _scatter_elements_kernel_param_def )
|
||||
|
||||
|
||||
/*
|
||||
* Kernel function
|
||||
*/
|
||||
DEF_KERNEL_EXECUTOR(_compute)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
|
||||
float * buffer[3] = { NULL };
|
||||
int32_t* buffer_idx = NULL;
|
||||
size_t out_elements = 0;
|
||||
vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
|
||||
vsi_size_t a = 0;
|
||||
vsi_size_t o = 0;
|
||||
vsi_size_t i = 0;
|
||||
vsi_size_t outer_size[2] = {1, 1};
|
||||
vsi_size_t inner_size[2] = {1, 1};
|
||||
vsi_size_t axis_size[2] = {1, 1};
|
||||
int32_t axis = 0;
|
||||
int32_t reduction = 0;
|
||||
|
||||
tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
|
||||
tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
|
||||
tensors[2] = (vsi_nn_kernel_tensor_t)param[2];
|
||||
tensors[3] = (vsi_nn_kernel_tensor_t)param[3];
|
||||
|
||||
attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
|
||||
attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
|
||||
attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );
|
||||
attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final );
|
||||
out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] );
|
||||
|
||||
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &axis);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &reduction);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
||||
buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
|
||||
CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final );
|
||||
|
||||
buffer_idx = (int32_t*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], FALSE );
|
||||
CHECK_PTR_FAIL_GOTO( buffer_idx, "Create input1 buffer fail.", final );
|
||||
|
||||
buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE );
|
||||
CHECK_PTR_FAIL_GOTO( buffer[1], "Create input2 buffer fail.", final );
|
||||
|
||||
buffer[2] = (float *)malloc( out_elements * sizeof(float) );
|
||||
CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final );
|
||||
memcpy( buffer[2], buffer[0], out_elements * sizeof(float) );
|
||||
|
||||
axis_size[0] = attr[0]->shape->data[axis];
|
||||
axis_size[1] = attr[1]->shape->data[axis];
|
||||
for (i = 0; i < (vsi_size_t)axis; ++i)
|
||||
{
|
||||
inner_size[0] *= attr[0]->shape->data[i];
|
||||
inner_size[1] *= attr[1]->shape->data[i];
|
||||
}
|
||||
|
||||
for (i = axis + 1; i < attr[1]->shape->size; ++i)
|
||||
{
|
||||
outer_size[0] *= attr[0]->shape->data[i];
|
||||
outer_size[1] *= attr[1]->shape->data[i];
|
||||
}
|
||||
|
||||
for (o = 0; o < outer_size[1]; o++)
|
||||
{
|
||||
for (a = 0; a < axis_size[1]; a++)
|
||||
{
|
||||
for (i = 0; i < inner_size[1]; i++)
|
||||
{
|
||||
vsi_ssize_t index = 0;
|
||||
vsi_size_t index0 = (o * axis_size[1] + a) * inner_size[1] + i;
|
||||
vsi_size_t index1 = 1;
|
||||
|
||||
index = (vsi_ssize_t)buffer_idx[index0];
|
||||
index1 = (o * axis_size[0] + index) * inner_size[0] + i;
|
||||
|
||||
switch (reduction)
|
||||
{
|
||||
case VSI_NN_REDUCTION_TYPE_NONE:
|
||||
buffer[2][index1] = buffer[1][index0];
|
||||
break;
|
||||
case VSI_NN_REDUCTION_TYPE_ADD:
|
||||
buffer[2][index1] += buffer[1][index0];
|
||||
break;
|
||||
case VSI_NN_REDUCTION_TYPE_MUL:
|
||||
buffer[2][index1] *= buffer[1][index0];
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
status = vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3],
|
||||
buffer[2], out_elements );
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
final:
|
||||
if ( buffer_idx )
|
||||
{
|
||||
free( buffer_idx );
|
||||
}
|
||||
for ( i = 0; i < 3; i ++ )
|
||||
{
|
||||
if ( buffer[i] )
|
||||
{
|
||||
free( buffer[i] );
|
||||
}
|
||||
}
|
||||
for ( i = 0; i < _CPU_IO_NUM; i ++ )
|
||||
{
|
||||
if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); }
|
||||
}
|
||||
|
||||
return status;
|
||||
} /* _compute() */
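/* Editor's illustrative note (not part of the committed code): with the loop layout above,
 * an element of the indices/updates tensors at (o, a, i) sits at
 *     index0 = (o * axis_size[1] + a) * inner_size[1] + i
 * and is scattered to
 *     index1 = (o * axis_size[0] + buffer_idx[index0]) * inner_size[0] + i
 * in the output. For example, assuming shape_x = {4, 3} and axis = 1 (inner_size = 4,
 * axis_size = 3, outer_size = 1), an index value of 2 stored at (o = 0, a = 1, i = 3)
 * takes updates[7] and writes it to output[2 * 4 + 3] = output[11]; with
 * reduction == VSI_NN_REDUCTION_TYPE_ADD it is accumulated instead of overwritten. */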
|
||||
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs
|
||||
/* Add extra params */
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_SUCCESS;
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
|
||||
kernel->info.function = _compute;
|
||||
kernel->info.parameters = _scatter_elements_kernel_param_def;
|
||||
kernel->info.numParams = _cnt_of_array( _scatter_elements_kernel_param_def );
|
||||
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_SCATTER_ELEMENTS_PARAM_NUM];
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
|
||||
int32_t reduction = vsi_nn_kernel_param_get_int32( params, "reduction" );
|
||||
|
||||
status = _query_kernel( kernel, inputs, outputs /* Add extra params */ );
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( node_params, _SCATTER_ELEMENTS_PARAM_NUM,
|
||||
inputs, input_num, outputs, output_num );
|
||||
/* Pass parameters to node. */
|
||||
node_params[4] = vsi_nn_kernel_scalar_create( graph, I32, &axis );
|
||||
node_params[5] = vsi_nn_kernel_scalar_create( graph, I32, &reduction );
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _SCATTER_ELEMENTS_PARAM_NUM );
|
||||
vsi_nn_kernel_scalar_release( &node_params[4] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[5] );
|
||||
}
|
||||
}
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_CPU( scatter_elements, _setup )
@ -0,0 +1,323 @@
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
/*
|
||||
* Define kernel meta.
|
||||
*/
|
||||
typedef enum
|
||||
{
|
||||
INTERNAL_KERNEL_BUCKETIZE,
|
||||
} _internal_kernel_e;
|
||||
|
||||
#define STR(a) #a
|
||||
|
||||
// Add kernel hashtable here
|
||||
#define BUCKETIZE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ) \
|
||||
(( IN0_DTYPE ) | ( IN1_DTYPE << 8 ) | ( OUT_DTYPE << 16 ) | (RIGHT << 24) | (IMG_2D << 25))
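/* Editor's illustrative note: the key packs the three dtype enums into bytes 0-2 and the
 * two flags into bits 24 and 25, e.g. BUCKETIZE_HASH_KEY( in0, in1, out, 1, 1 ) evaluates
 * to in0 | (in1 << 8) | (out << 16) | 0x3000000. PACK_KERNEL_2D_MAP registers entries under
 * these keys and _query_kernel below recomputes the same key to pick the shader. */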
|
||||
|
||||
#define PACK_KERNEL_2D_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ) \
|
||||
{ BUCKETIZE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ), \
|
||||
CVIVANTE_NAMESPACE("evis.bucketize_right_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \
|
||||
"bucketize" }
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint32_t key;
|
||||
char * function_name;
|
||||
const char * source_name;
|
||||
} _kernel_map_type;
|
||||
|
||||
static const _kernel_map_type _bucketize_kernel_map[] =
|
||||
{
|
||||
// Register kernel here
|
||||
PACK_KERNEL_2D_MAP( F16, F16, I32, 1, 1 ),
|
||||
PACK_KERNEL_2D_MAP( I16, I16, I32, 1, 1 ),
|
||||
PACK_KERNEL_2D_MAP( U8, U8, I32, 1, 1 ),
|
||||
PACK_KERNEL_2D_MAP( I8, I8, I32, 1, 1 ),
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _bucketize_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
// Add kernel parameters here
|
||||
};
|
||||
#define _BUCKETIZE_PARAM_NUM _cnt_of_array( _bucketize_kernel_param_def )
|
||||
|
||||
/*
|
||||
* Kernel initializer
|
||||
*/
|
||||
DEF_KERNEL_INITIALIZER(_bucketize_initializer)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
gpu_param_t gpu_param = {
|
||||
3,
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0}
|
||||
};
|
||||
vsi_nn_kernel_tensor_attr_t * input0_attr = NULL;
|
||||
vsi_nn_kernel_tensor_attr_t * input1_attr = NULL;
|
||||
vsi_size_array_t * input0_shape = NULL;
|
||||
vsi_size_array_t * input1_shape = NULL;
|
||||
|
||||
input0_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
|
||||
CHECK_PTR_FAIL_GOTO( input0_attr, "Create tensor attr buffer fail.", final );
|
||||
input1_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
|
||||
CHECK_PTR_FAIL_GOTO( input1_attr, "Create tensor attr buffer fail.", final );
|
||||
|
||||
input0_shape = input0_attr->shape;
|
||||
input1_shape = input1_attr->shape;
|
||||
|
||||
gpu_param.dim = 2;
|
||||
gpu_param.global_scale[0] = 8;
|
||||
gpu_param.global_scale[1] = 1;
|
||||
gpu_param.global_size[0] = gpu_align_p2(
|
||||
(input0_shape->data[0] + gpu_param.global_scale[0] - 1)
|
||||
/ gpu_param.global_scale[0], 4);
|
||||
gpu_param.global_size[1] = (
|
||||
(input0_shape->data[1] + gpu_param.global_scale[1] - 1)
|
||||
/ gpu_param.global_scale[1]);
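/* Editor's illustrative note: each thread covers global_scale[0] = 8 elements along x, so
 * for a hypothetical width of 100 the x dispatch count is ceil(100 / 8) = 13, which
 * gpu_align_p2(..., 4) is expected to round up to a multiple of 4, i.e. 16; y stays at one
 * row per thread. */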
|
||||
|
||||
{
|
||||
gpu_dp_inst_t uniDataConvert_0_4x4 = {{
|
||||
0x01010101, // TCfg
|
||||
0x00000000, // ASelt
|
||||
0x00010000, 0x00030002, // ABin
|
||||
0x02020202, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00000300, // AccumType, ConstantType, and PostShift
|
||||
0x00000001, 0x00000000, 0x00000001, 0x00000000,
|
||||
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
|
||||
}, GPU_DP_TYPE_16};
|
||||
gpu_dp_inst_t uniDataConvert_1_4x4 = {{
|
||||
0x01010101, // TCfg
|
||||
0x00000000, // ASelt
|
||||
0x00050004, 0x00070006, // ABin
|
||||
0x02020202, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00000300, // AccumType, ConstantType, and PostShift
|
||||
0x00000001, 0x00000000, 0x00000001, 0x00000000,
|
||||
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
|
||||
}, GPU_DP_TYPE_16};
|
||||
int32_t boundaries_size = (int32_t)input1_shape->data[0];
|
||||
int32_t boundaries_size_x8 = (boundaries_size / 8) * 8;
|
||||
|
||||
status = vsi_nn_kernel_gpu_add_param( node, "uniDataConvert_0_4x4", &uniDataConvert_0_4x4);
|
||||
status |= vsi_nn_kernel_gpu_add_param( node, "uniDataConvert_1_4x4", &uniDataConvert_1_4x4);
|
||||
status |= vsi_nn_kernel_gpu_add_param( node, "boundaries_size_x8", &boundaries_size_x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param( node, "boundaries_size", &boundaries_size);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
}
|
||||
|
||||
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
|
||||
|
||||
final:
|
||||
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
|
||||
SAFE_FREE_TENSOR_ATTR(input0_attr);
|
||||
SAFE_FREE_TENSOR_ATTR(input1_attr);
|
||||
#undef SAFE_FREE_TENSOR_ATTR
|
||||
|
||||
return status;
|
||||
} /* _bucketize_initializer() */
|
||||
|
||||
static vsi_bool _bucketize_support_types
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t * input,
|
||||
vsi_nn_tensor_t * boundaries,
|
||||
int32_t right
|
||||
)
|
||||
{
|
||||
vsi_size_t width = input->attr.size[0];
|
||||
vsi_size_t height = input->attr.size[1];
|
||||
vsi_size_t boundaries_size = boundaries->attr.size[0];
|
||||
vsi_bool image_2d = FALSE;
|
||||
vsi_nn_kernel_dtype_e in_dtype = vsi_nn_kernel_map_dtype( input->attr.dtype.vx_type );
|
||||
|
||||
image_2d = (input->attr.dim_num == 2 || input->attr.size[2] == 1);
|
||||
|
||||
if ( vsi_nn_is_same_type(input, boundaries) == FALSE || right == 0 || image_2d == FALSE )
|
||||
{
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
if (in_dtype == F16 && graph->ctx->config.evis.ver != VSI_NN_HW_EVIS_2)
|
||||
{
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
#define MAX_16BITS_BOUNDARIES_SIZE (0xFFFF)
|
||||
if ( (in_dtype == F16 || in_dtype == I16) && boundaries_size > MAX_16BITS_BOUNDARIES_SIZE )
|
||||
{
|
||||
return FALSE;
|
||||
}
|
||||
#undef MAX_16BITS_BOUNDARIES_SIZE
|
||||
|
||||
#define MAX_8BITS_BOUNDARIES_SIZE (0xFF)
|
||||
if ( (in_dtype == I8 || in_dtype == U8) && boundaries_size > MAX_8BITS_BOUNDARIES_SIZE )
|
||||
{
|
||||
return FALSE;
|
||||
}
|
||||
#undef MAX_8BITS_BOUNDARIES_SIZE
|
||||
|
||||
#define INPUT_SIZE_ALIGN8 (8)
|
||||
if ( width % INPUT_SIZE_ALIGN8 != 0 && height != 1 )
|
||||
{
|
||||
return FALSE;
|
||||
}
|
||||
#undef INPUT_SIZE_ALIGN8
|
||||
|
||||
return TRUE;
|
||||
}
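/* Editor's note: the checks above restrict this EVIS path to 2D-shaped inputs whose dtype
 * matches the boundaries tensor, right == 1, a boundaries count that fits the per-dtype
 * limit, and a width that is a multiple of 8 unless height is 1; when _setup sees FALSE it
 * returns NULL so kernel selection can presumably fall back to another implementation. */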
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs,
|
||||
int32_t right
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_dtype_e in0_dtype;
|
||||
vsi_nn_kernel_dtype_e in1_dtype;
|
||||
vsi_nn_kernel_dtype_e out_dtype;
|
||||
const _kernel_map_type * kernel_map = _bucketize_kernel_map;
|
||||
size_t kernel_map_size = _cnt_of_array( _bucketize_kernel_map );
|
||||
vx_param_description_t * param_def = _bucketize_kernel_param_def;
|
||||
vx_kernel_initialize_f initializer = _bucketize_initializer;
|
||||
|
||||
uint32_t key;
|
||||
uint32_t i;
|
||||
|
||||
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
|
||||
in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
|
||||
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
|
||||
|
||||
key = BUCKETIZE_HASH_KEY( in0_dtype, in1_dtype, out_dtype, right, 1 );
|
||||
|
||||
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
|
||||
{
|
||||
if ( kernel_map[i].key == key )
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
if ( i < (uint32_t)kernel_map_size )
|
||||
{
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
|
||||
kernel->info.parameters = param_def;
|
||||
kernel->info.numParams = _cnt_of_array( _bucketize_kernel_param_def );
|
||||
kernel->info.initialize = initializer;
|
||||
// Register code source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
|
||||
"vsi_nn_kernel_header",
|
||||
kernel_map[i].source_name );
|
||||
// Register binary source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
|
||||
kernel_map[i].source_name );
|
||||
status = VSI_SUCCESS;
|
||||
}
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_BUCKETIZE_PARAM_NUM];
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
int32_t right = vsi_nn_kernel_param_get_int32( params, "right" );
|
||||
|
||||
if( !vsi_nn_kernel_gpu_check_shape(
|
||||
inputs[0]->attr.size, inputs[0]->attr.dim_num ) )
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if ( _bucketize_support_types(graph, inputs[0], inputs[1], right) == FALSE )
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
status = _query_kernel( kernel, inputs, outputs, right );
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( node_params, _BUCKETIZE_PARAM_NUM,
|
||||
inputs, input_num, outputs, output_num );
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _BUCKETIZE_PARAM_NUM );
|
||||
}
|
||||
}
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_EVIS( bucketize, _setup )
@ -158,7 +158,7 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_z_h_initializer)
if (srcFixPointPos >= 0)
|
||||
output_scale *= (vx_float32)((int64_t)1 << srcFixPointPos);
|
||||
else if (srcFixPointPos < 0)
|
||||
output_scale *= 1.0f / (vx_float32) ((int64_t)1 << -srcFixPointPos);
|
||||
output_scale *= 1.0f / (vx_float32) ((int64_t)1 << - srcFixPointPos);
|
||||
}
|
||||
else if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr[0]->quant )
|
||||
{
@ -47,7 +47,8 @@ __BEGIN_DECLS
typedef enum
|
||||
{
|
||||
INTERNAL_KERNEL_SUMS,
|
||||
INTERNAL_KERNEL_NORM,
|
||||
INTERNAL_KERNEL_MEANS,
|
||||
INTERNAL_KERNEL_NORMS,
|
||||
} _internal_kernel_e;
|
||||
|
||||
#define KERNEL_SOURCE_0 "instance_normalization_0"
|
||||
|
|
@ -61,6 +62,9 @@ typedef enum
|
|||
#define HASH_INSTANCENORM_SUMS_SH_KERNEL_2D_NAME(SRC0_TYPE) \
|
||||
CVIVANTE_NAMESPACE("evis.instance_norm_sums_"#SRC0_TYPE"_2D")
|
||||
|
||||
#define HASH_INSTANCENORM_MEANS_SH_KERNEL_NAME() \
|
||||
CVIVANTE_NAMESPACE("evis.instance_norm_means")
|
||||
|
||||
#define HASH_INSTANCENORM_SCALE_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \
|
||||
CVIVANTE_NAMESPACE("evis.instance_norm_"#SRC0_TYPE"_F32to"#DST_TYPE)
|
||||
|
||||
|
|
@ -68,8 +72,8 @@ typedef enum
|
|||
CVIVANTE_NAMESPACE("evis.instance_norm_"#SRC0_TYPE"_F32to"#DST_TYPE"_2D")
|
||||
|
||||
// Add kernel hashtable here
|
||||
#define HASH_INSTANCENORM_SUMS_KEY(_input0_type, _output_type, _reshape_flag) \
|
||||
((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8))
|
||||
#define HASH_INSTANCENORM_SUMS_KEY(_input0_type, _output_type, _img_2d) \
|
||||
((_input0_type << 24) | (_output_type << 16) | (_img_2d << 8))
|
||||
|
||||
#define TENSOR_INSTANCENORM_SUMS_KERNELS_3D(IN0_TYPE, OUT_TYPE, SOURCE) \
|
||||
{ HASH_INSTANCENORM_SUMS_KEY(IN0_TYPE, OUT_TYPE, 0), \
|
||||
|
|
@ -81,6 +85,14 @@ typedef enum
|
|||
HASH_INSTANCENORM_SUMS_SH_KERNEL_2D_NAME(IN0_TYPE), \
|
||||
SOURCE },
|
||||
|
||||
#define HASH_INSTANCENORM_MEANS_KEY(ALPHA_TYPE, BETA_TYPE) \
|
||||
((F32 << 24) | (ALPHA_TYPE << 16) | (BETA_TYPE << 8) | (F32))
|
||||
|
||||
#define TENSOR_INSTANCENORM_MEANS_KERNELS(ALPHA_TYPE, BETA_TYPE) \
|
||||
{ HASH_INSTANCENORM_MEANS_KEY(ALPHA_TYPE, BETA_TYPE), \
|
||||
HASH_INSTANCENORM_MEANS_SH_KERNEL_NAME(), \
|
||||
KERNEL_SOURCE_0 },
|
||||
|
||||
// normalization
|
||||
#define HASH_INSTANCENORM_KEY(_input0_type, _input1_type, _output_type, _reshape_flag) \
|
||||
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_reshape_flag << 4))
|
||||
|
|
@ -117,6 +129,13 @@ static const _kernel_map_type _instancenorm_sums_kernel_map[] =
|
|||
TENSOR_INSTANCENORM_SUMS_KERNELS_2D( BF16, F32, KERNEL_SOURCE_3 )
|
||||
};
|
||||
|
||||
static const _kernel_map_type _instancenorm_means_kernel_map[] =
|
||||
{
|
||||
// Register kernel here
|
||||
TENSOR_INSTANCENORM_MEANS_KERNELS( F32, F32 )
|
||||
};
|
||||
|
||||
|
||||
static const _kernel_map_type _instancenorm_kernel_map[] =
|
||||
{
|
||||
// Register kernel here
|
||||
|
|
@ -162,15 +181,36 @@ static vx_param_description_t _instancenorm_sums_kernel_param_def[] =
|
|||
};
|
||||
#define _INSTANCENORM_SUMS_PARAM_NUM _cnt_of_array( _instancenorm_sums_kernel_param_def )
|
||||
|
||||
static vx_param_description_t _instancenorm_kernel_param_def[] =
|
||||
static vx_param_description_t _instancenorm_means_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
// Add kernel parameters here
|
||||
};
|
||||
#define _INSTANCENORM_MEANS_PARAM_NUM _cnt_of_array( _instancenorm_means_kernel_param_def )
|
||||
#define MEANS_EPS_SCL (4)
|
||||
#define MEANS_INPUT_SCALE_SCL (5)
|
||||
#define MEANS_INPUT_ZP_SCL (6)
|
||||
#define MEANS_OUTPUT_SCALE_SCL (7)
|
||||
#define MEANS_OUTPUT_ZP_SCL (8)
|
||||
#define MEANS_INV_MULTIPLIER_SCL (9)
|
||||
#define MEANS_GROUP_NUM_SCL (10)
|
||||
|
||||
static vx_param_description_t _instancenorm_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
// Add kernel parameters here
|
||||
};
|
||||
#define _INSTANCENORM_PARAM_NUM _cnt_of_array( _instancenorm_kernel_param_def )
|
||||
|
|
@ -195,7 +235,6 @@ DEF_KERNEL_INITIALIZER(_instancenorm_sums_initializer)
|
|||
|
||||
vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
|
||||
vsi_size_array_t * input_shape = NULL;
|
||||
int32_t rs_flag = 0;
|
||||
int32_t width = 0;
|
||||
int32_t height = 0;
|
||||
int32_t chn = 0;
|
||||
|
|
@ -212,7 +251,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_sums_initializer)
|
|||
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
|
||||
|
||||
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &rs_flag);
|
||||
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &height);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
|
||||
input_shape = attr[0]->shape;
|
||||
|
|
@ -221,12 +260,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_sums_initializer)
|
|||
input_zp = (float)attr[0]->zero_point;
|
||||
|
||||
width = (int32_t)(input_shape->data[0]);
|
||||
height = (int32_t)(input_shape->data[1]);
|
||||
chn = (int32_t)(attr[1]->shape->data[1]);
|
||||
if (rs_flag)
|
||||
{
|
||||
height = height / chn;
|
||||
}
|
||||
|
||||
work_item_pixels = (float)height * 16;
|
||||
|
||||
|
|
@ -333,7 +367,6 @@ DEF_KERNEL_INITIALIZER(_instancenorm_sums_initializer)
|
|||
}
|
||||
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "width", &width);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "height", &height);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
|
||||
OnError:
|
||||
|
|
@ -351,6 +384,55 @@ OnError:
|
|||
return status;
|
||||
}
|
||||
|
||||
DEF_KERNEL_INITIALIZER(_instancenorm_means_initializer)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
gpu_param_t shaderParam = {
|
||||
2, // workdim
|
||||
{0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
|
||||
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
|
||||
{0, 0, 0}, // localWorkSize: local group size in thread
|
||||
{0, 0, 0}}; // globalWorkSize: image size in thread
|
||||
|
||||
vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
|
||||
vsi_size_array_t * input_shape = NULL;
|
||||
|
||||
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
|
||||
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
|
||||
|
||||
input_shape = attr[0]->shape;
|
||||
|
||||
shaderParam.global_scale[0] = 1;
|
||||
shaderParam.global_scale[1] = 1;
|
||||
|
||||
shaderParam.global_size[0] = 1;
|
||||
shaderParam.global_size[1] = input_shape->data[1];
|
||||
|
||||
status = vsi_nn_kernel_gpu_config( node, &shaderParam );
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError);
|
||||
|
||||
OnError:
|
||||
if (attr[0])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &attr[0] );
|
||||
attr[0] = NULL;
|
||||
}
|
||||
if (attr[1])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &attr[1] );
|
||||
attr[1] = NULL;
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
|
|
@ -366,52 +448,26 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
|
|||
{0, 0, 0}, // localWorkSize: local group size in thread
|
||||
{0, 0, 0}}; // globalWorkSize: image size in thread
|
||||
|
||||
vsi_nn_kernel_tensor_attr_t* attr[4] = {NULL, NULL};
|
||||
vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL, NULL};
|
||||
vsi_size_array_t * input_shape = NULL;
|
||||
float input_scale = 1;
|
||||
float output_scale = 1;
|
||||
float input_zp = 0;
|
||||
float output_zp = 0;
|
||||
float inv_multiplier = 0;
|
||||
vx_uint32 group_num = 0;
|
||||
vx_int32 height = 0, width = 0, chn = 0;
|
||||
int32_t rs_flag = 0;
|
||||
vx_int32 width = 0, chn = 0;
|
||||
|
||||
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
|
||||
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
|
||||
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
|
||||
attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
|
||||
attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError );
|
||||
attr[3] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[4] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", OnError );
|
||||
|
||||
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &rs_flag);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
|
||||
input_shape = attr[0]->shape;
|
||||
input_scale = attr[0]->scale;
|
||||
input_zp = (float)attr[0]->zero_point;
|
||||
output_scale = 1.0f / attr[3]->scale;
|
||||
output_zp = (float)attr[3]->zero_point;
|
||||
|
||||
width = (int32_t)(input_shape->data[0]);
|
||||
height = (int32_t)(input_shape->data[1]);
|
||||
chn = (int32_t)(attr[2]->shape->data[1]);
|
||||
if (rs_flag)
|
||||
{
|
||||
height = height / chn;
|
||||
}
|
||||
|
||||
inv_multiplier = (float)(1.0 / (width * height));
|
||||
|
||||
group_num = (width + 255) / 256;
|
||||
chn = (int32_t)(attr[1]->shape->data[1]);
|
||||
|
||||
shaderParam.global_scale[0] = 16;
|
||||
if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == BF16)
|
||||
{
|
||||
shaderParam.global_scale[0] = 8;
|
||||
group_num = (width + 127) / 128;
|
||||
}
|
||||
|
||||
shaderParam.global_scale[1] = 1;
|
||||
|
|
@ -521,12 +577,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
|
|||
#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE ) \
|
||||
(IN0_TYPE | (OUT_TYPE << 16))
|
||||
|
||||
pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[3]->dtype );
|
||||
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "height", &height);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "inv_multiplier", &inv_multiplier);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "group_num", &group_num);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype );
|
||||
|
||||
switch( pack_key )
|
||||
{
|
||||
|
|
@ -535,7 +586,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
|
|||
case _PACK_SELECT_KEY( U8, U8 ):
|
||||
case _PACK_SELECT_KEY( I8, I8 ):
|
||||
{
|
||||
if (attr[3]->dtype == F16)
|
||||
if (attr[2]->dtype == F16)
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8",
|
||||
&uniExtractHalf8_2x8);
|
||||
|
|
@ -544,11 +595,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
|
|||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8",
|
||||
&uniExtractInteger_2x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp);
|
||||
}
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &input_scale);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "input_zp", &input_zp);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_0_4x4",
|
||||
&uniDataToFP32_0_4x4);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_1_4x4",
|
||||
|
|
@ -567,7 +614,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
|
|||
case _PACK_SELECT_KEY( F16, U8 ):
|
||||
case _PACK_SELECT_KEY( F16, I8 ):
|
||||
{
|
||||
if (attr[3]->dtype == F16)
|
||||
if (attr[2]->dtype == F16)
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8",
|
||||
&uniExtractHalf8_2x8);
|
||||
|
|
@ -577,14 +624,10 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
|
|||
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8",
|
||||
&uniExtractInteger_2x8);
|
||||
}
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &input_scale);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "input_zp", &input_zp);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_0_4x4",
|
||||
&uniDataToFP32_0_4x4);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_1_4x4",
|
||||
&uniDataToFP32_1_4x4);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
|
|
@ -612,21 +655,18 @@ OnError:
|
|||
vsi_nn_kernel_tensor_attr_release( &attr[0] );
|
||||
attr[0] = NULL;
|
||||
}
|
||||
|
||||
if (attr[1])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &attr[1] );
|
||||
attr[1] = NULL;
|
||||
}
|
||||
|
||||
if (attr[2])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &attr[2] );
|
||||
attr[2] = NULL;
|
||||
}
|
||||
if (attr[3])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &attr[3] );
|
||||
attr[3] = NULL;
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
|
@ -637,7 +677,9 @@ OnError:
|
|||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
const uint32_t hashkey,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs,
|
||||
vsi_bool img_2d,
|
||||
_internal_kernel_e kernel_id
|
||||
/* Add extra params */
|
||||
)
|
||||
|
|
@ -649,6 +691,18 @@ static vsi_status _query_kernel
|
|||
size_t kernel_map_size = 0;
|
||||
size_t param_size = 0;
|
||||
uint32_t i = 0;
|
||||
uint32_t hashkey = 0;
|
||||
vsi_nn_kernel_dtype_e in0_dtype = U8;
|
||||
vsi_nn_kernel_dtype_e in1_dtype = F16;
|
||||
vsi_nn_kernel_dtype_e in2_dtype = F16;
|
||||
vsi_nn_kernel_dtype_e out_dtype = U8;
|
||||
|
||||
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
|
||||
in1_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
|
||||
in2_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
|
||||
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
|
||||
in1_dtype = in1_dtype == F16 ? F32 : in1_dtype;
|
||||
in2_dtype = in2_dtype == F16 ? F32 : in2_dtype;
|
||||
|
||||
switch ( kernel_id )
|
||||
{
|
||||
|
|
@ -658,13 +712,23 @@ static vsi_status _query_kernel
|
|||
kernel_map_size = _cnt_of_array( _instancenorm_sums_kernel_map );
|
||||
param_def = _instancenorm_sums_kernel_param_def;
|
||||
param_size = _INSTANCENORM_SUMS_PARAM_NUM;
|
||||
hashkey = HASH_INSTANCENORM_SUMS_KEY( in0_dtype, F32, img_2d );
|
||||
break;
|
||||
case INTERNAL_KERNEL_NORM:
|
||||
case INTERNAL_KERNEL_MEANS:
|
||||
initializer = _instancenorm_means_initializer;
|
||||
kernel_map = _instancenorm_means_kernel_map;
|
||||
kernel_map_size = _cnt_of_array( _instancenorm_means_kernel_map );
|
||||
param_def = _instancenorm_means_kernel_param_def;
|
||||
param_size = _INSTANCENORM_MEANS_PARAM_NUM;
|
||||
hashkey = HASH_INSTANCENORM_MEANS_KEY( in1_dtype, in2_dtype );
|
||||
break;
|
||||
case INTERNAL_KERNEL_NORMS:
|
||||
initializer = _instancenorm_initializer;
|
||||
kernel_map = _instancenorm_kernel_map;
|
||||
kernel_map_size = _cnt_of_array( _instancenorm_kernel_map );
|
||||
param_def = _instancenorm_kernel_param_def;
|
||||
param_size = _INSTANCENORM_PARAM_NUM;
|
||||
hashkey = HASH_INSTANCENORM_KEY( in0_dtype, F32, out_dtype, img_2d );
|
||||
break;
|
||||
default:
|
||||
VSI_ASSERT( FALSE );
|
||||
|
|
@ -709,23 +773,21 @@ static vsi_nn_kernel_node_t _setup
|
|||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
#define INTERNAL_KERNEL_SIZE (1)
|
||||
#define MEAN_VARI_INDEX (0)
|
||||
#define INTERNAL_KERNEL_SIZE (2)
|
||||
#define SUMS_INDEX (0)
|
||||
#define MEANS_INDEX (1)
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t sums_node_params[_INSTANCENORM_SUMS_PARAM_NUM] = { NULL };
|
||||
vsi_nn_kernel_node_param_t means_node_params[_INSTANCENORM_MEANS_PARAM_NUM] = { NULL };
|
||||
vsi_nn_kernel_node_param_t node_params[_INSTANCENORM_PARAM_NUM] = { NULL };
|
||||
vsi_nn_kernel_node_t tmp_node = NULL;
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
vsi_nn_kernel_dtype_e in0_dtype = U8;
|
||||
vsi_nn_kernel_dtype_e in1_dtype = F16;
|
||||
vsi_nn_kernel_dtype_e out_dtype = U8;
|
||||
vsi_nn_kernel_node_t sums_node = NULL;
|
||||
vsi_nn_kernel_node_t means_node = NULL;
|
||||
vsi_nn_kernel_node_t norms_node = NULL;
|
||||
vsi_nn_tensor_attr_t attr;
|
||||
vsi_nn_kernel_t * ikernels[INTERNAL_KERNEL_SIZE] = { NULL };
|
||||
vsi_nn_tensor_t * tensors[INTERNAL_KERNEL_SIZE] = { NULL };
|
||||
vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL, rs_gamma = NULL, rs_beta = NULL;
|
||||
vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0};
|
||||
uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 };
|
||||
uint32_t hashkey = 0;
|
||||
int32_t i = 0;
|
||||
int32_t axis[VSI_NN_MAX_DIM_NUM] = {0, 1};
|
||||
int32_t axis_num = 2;
|
||||
|
|
@ -735,35 +797,47 @@ static vsi_nn_kernel_node_t _setup
|
|||
uint32_t rank = outputs[0]->attr.dim_num;
|
||||
vsi_nn_tensor_t *reshape_tensor[2] = {NULL};
|
||||
float eps = vsi_nn_kernel_param_get_float32( params, "eps" );
|
||||
float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
|
||||
float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]);
|
||||
float in_time_out_scale = vsi_nn_get_tensor_scale(inputs[0]) * output_scale;
|
||||
float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
|
||||
float inv_multiplier = 1.0f / (float)(inputs[0]->attr.size[0] * inputs[0]->attr.size[1]);
|
||||
int32_t height = 0;
|
||||
int32_t group_num = 0;
|
||||
int32_t reshape_flg = 0;
|
||||
vsi_size_t batch = 1;
|
||||
vsi_bool ret = FALSE;
|
||||
|
||||
ret = vsi_nn_kernel_optimize_tensor_shape(
|
||||
inputs[0]->attr.size, inputs[0]->attr.dim_num,
|
||||
axis, axis_num, new_shape, &rank, new_axis, &axis_size);
|
||||
if ( ret == FALSE || axis_size > 2 )
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
memcpy(new_shape, inputs[0]->attr.size, sizeof(inputs[0]->attr.size));
|
||||
|
||||
for (i = 3; i < (int32_t)inputs[0]->attr.dim_num; i++)
|
||||
if (new_shape[0] >= GPU_TENSOR_MAX_WIDTH || new_shape[1] >= GPU_TENSOR_MAX_WIDTH)
|
||||
{
|
||||
batch *= inputs[0]->attr.size[i];
|
||||
}
|
||||
|
||||
if (axis_size == 1)
|
||||
{
|
||||
for (i = rank; i > 1; i--)
|
||||
ret = vsi_nn_kernel_optimize_tensor_shape(
|
||||
inputs[0]->attr.size, inputs[0]->attr.dim_num,
|
||||
axis, axis_num, new_shape, &rank, new_axis, &axis_size);
|
||||
if ( ret == FALSE || axis_size > 2 )
|
||||
{
|
||||
new_shape[i] = new_shape[i - 1];
|
||||
return NULL;
|
||||
}
|
||||
new_shape[1] = 1;
|
||||
rank ++;
|
||||
|
||||
for (i = 3; i < (int32_t)inputs[0]->attr.dim_num; i++)
|
||||
{
|
||||
batch *= inputs[0]->attr.size[i];
|
||||
}
|
||||
|
||||
if (axis_size == 1)
|
||||
{
|
||||
for (i = rank; i > 1; i--)
|
||||
{
|
||||
new_shape[i] = new_shape[i - 1];
|
||||
}
|
||||
new_shape[1] = 1;
|
||||
rank ++;
|
||||
}
|
||||
new_shape[2] = rank == 2 ? 1 : new_shape[2] / batch;
|
||||
new_shape[3] = batch;
|
||||
rank = 4;
|
||||
}
|
||||
new_shape[2] = rank == 2 ? 1 : new_shape[2] / batch;
|
||||
new_shape[3] = batch;
|
||||
rank = 4;
|
||||
|
||||
reshape_tensor[0] = vsi_nn_reshape_tensor( graph,
|
||||
inputs[0], new_shape, rank );
|
||||
|
|
@ -786,24 +860,7 @@ static vsi_nn_kernel_node_t _setup
|
|||
ikernels[i]->unique_id = kernel->unique_id;
|
||||
}
|
||||
|
||||
in0_dtype = vsi_nn_kernel_map_dtype( reshape_tensor[0]->attr.dtype.vx_type );
|
||||
in1_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
|
||||
out_dtype = vsi_nn_kernel_map_dtype( reshape_tensor[1]->attr.dtype.vx_type );
|
||||
in1_dtype = in1_dtype == F16 ? F32 : in1_dtype;
|
||||
|
||||
hashkeys[MEAN_VARI_INDEX]= HASH_INSTANCENORM_SUMS_KEY( in0_dtype, F32, reshape_flg );
|
||||
hashkey = HASH_INSTANCENORM_KEY( in0_dtype, in1_dtype, out_dtype, reshape_flg );
|
||||
|
||||
status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_SUMS );
|
||||
if ( VSI_SUCCESS != status )
|
||||
{
|
||||
goto final;
|
||||
}
|
||||
status = _query_kernel( kernel, hashkey, INTERNAL_KERNEL_NORM );
|
||||
if ( VSI_SUCCESS != status )
|
||||
{
|
||||
goto final;
|
||||
}
|
||||
height = (int32_t)new_shape[1];
|
||||
|
||||
if (reshape_flg)
|
||||
{
|
||||
|
|
@ -816,6 +873,8 @@ static vsi_nn_kernel_node_t _setup
|
|||
}
|
||||
else if (new_shape[0] < new_shape[1])
|
||||
{
|
||||
height = (int32_t)new_shape[0];
|
||||
|
||||
shape[0] = new_shape[1];
|
||||
shape[1] = new_shape[0];
|
||||
shape[2] = new_shape[2];
|
||||
|
|
@ -835,78 +894,121 @@ static vsi_nn_kernel_node_t _setup
|
|||
attr.is_const = FALSE;
|
||||
attr.vtl = TRUE;
|
||||
attr.size[0] = ((shape[0] + 255) / 256) * 4;
|
||||
group_num = gpu_align_np2_safe((int32_t)shape[0], 256) / 256;
|
||||
if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16
|
||||
|| inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16
|
||||
|| inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16)
|
||||
{
|
||||
group_num = gpu_align_np2_safe((int32_t)shape[0], 128) / 128;
|
||||
attr.size[0] = ((shape[0] + 127) / 128) * 4;
|
||||
}
|
||||
attr.size[1] = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1;
|
||||
attr.size[2] = 1;
|
||||
attr.size[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1;
|
||||
attr.dim_num = 4;
|
||||
tensors[MEAN_VARI_INDEX] = vsi_nn_CreateTensor( graph, &attr );
|
||||
tensors[SUMS_INDEX] = vsi_nn_CreateTensor( graph, &attr );
|
||||
attr.size[0] = 4;
|
||||
tensors[MEANS_INDEX] = vsi_nn_CreateTensor( graph, &attr );
|
||||
|
||||
shape[0] = 1;
|
||||
shape[1] = rank > 2 ? new_shape[2] : 1;
|
||||
rs_beta = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shape, 2 );
|
||||
rs_gamma = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shape, 2 );
|
||||
|
||||
// Mean Vari
|
||||
/* x0 = sum(x) and x1 = sum(x * x) */
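/* Editor's note: these two running sums are enough for the later passes, since
 * mean = x0 / N and variance = x1 / N - mean * mean, so no second scan of the input
 * is needed before normalization. */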
|
||||
status = _query_kernel( ikernels[SUMS_INDEX], inputs, outputs, reshape_flg, INTERNAL_KERNEL_SUMS );
|
||||
if ( VSI_SUCCESS != status )
|
||||
{
|
||||
tmp_node = vsi_nn_kernel_create_node( graph, ikernels[MEAN_VARI_INDEX] );
|
||||
if (tmp_node)
|
||||
goto final;
|
||||
}
|
||||
|
||||
sums_node = vsi_nn_kernel_create_node( graph, ikernels[SUMS_INDEX] );
|
||||
if (sums_node)
|
||||
{
|
||||
uint32_t index = 0;
|
||||
|
||||
|
||||
sums_node_params[index++] = rs_input;
|
||||
vsi_nn_kernel_node_pack_io( &sums_node_params[index],
|
||||
_INSTANCENORM_SUMS_PARAM_NUM, NULL, 0, tensors, 1 );
|
||||
index = 2;
|
||||
sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps );
|
||||
sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height );
|
||||
|
||||
status = vsi_nn_kernel_node_pass_param( sums_node, sums_node_params,
|
||||
_INSTANCENORM_SUMS_PARAM_NUM );
|
||||
CHECK_STATUS(status);
|
||||
vsi_nn_kernel_scalar_release( &sums_node_params[2] );
|
||||
vsi_nn_kernel_scalar_release( &sums_node_params[3] );
|
||||
{
|
||||
uint32_t index = 0;
|
||||
// Set default border mode.
|
||||
vx_border_t border;
|
||||
border.mode = VX_BORDER_CONSTANT;
|
||||
|
||||
sums_node_params[index++] = rs_input;
|
||||
vsi_nn_kernel_node_pack_io( &sums_node_params[index],
|
||||
_INSTANCENORM_SUMS_PARAM_NUM, NULL, 0, tensors, 1 );
|
||||
index = 2;
|
||||
sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps );
|
||||
sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reshape_flg );
|
||||
vsi_nn_Float32ToDtype(0, (uint8_t*)&border.constant_value.U32, &inputs[0]->attr.dtype);
|
||||
|
||||
status = vsi_nn_kernel_node_pass_param( tmp_node, sums_node_params,
|
||||
_INSTANCENORM_SUMS_PARAM_NUM );
|
||||
status = vxSetNodeAttribute( (vx_node)sums_node, VX_NODE_BORDER, &border, sizeof(border) );
|
||||
CHECK_STATUS(status);
|
||||
vsi_nn_kernel_scalar_release( &sums_node_params[2] );
|
||||
vsi_nn_kernel_scalar_release( &sums_node_params[3] );
|
||||
{
|
||||
// Set default border mode.
|
||||
vx_border_t border;
|
||||
border.mode = VX_BORDER_CONSTANT;
|
||||
|
||||
vsi_nn_Float32ToDtype(0, (uint8_t*)&border.constant_value.U32, &inputs[0]->attr.dtype);
|
||||
|
||||
status = vxSetNodeAttribute( (vx_node)tmp_node, VX_NODE_BORDER, &border, sizeof(border) );
|
||||
CHECK_STATUS(status);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Normalization
|
||||
/* a = input_scale * output_scale * alpha * mean
|
||||
b = (beta - scale * mean) * output_scale + output_zp - input * alpha */
|
||||
status = _query_kernel( ikernels[MEANS_INDEX], inputs, outputs, reshape_flg, INTERNAL_KERNEL_MEANS );
|
||||
if ( VSI_SUCCESS != status )
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if (node)
|
||||
{
|
||||
uint32_t index = 0;
|
||||
node_params[index++] = rs_input;
|
||||
node_params[index++] = rs_beta;
|
||||
node_params[index++] = rs_gamma;
|
||||
node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t;
|
||||
node_params[index++] = rs_output;
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reshape_flg );
|
||||
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params,
|
||||
_INSTANCENORM_PARAM_NUM );
|
||||
CHECK_STATUS(status);
|
||||
vsi_nn_kernel_scalar_release( &node_params[5] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[6] );
|
||||
}
|
||||
goto final;
|
||||
}
|
||||
|
||||
means_node = vsi_nn_kernel_create_node( graph, ikernels[MEANS_INDEX] );
|
||||
if (means_node)
|
||||
{
|
||||
means_node_params[0] = tensors[SUMS_INDEX]->t;
|
||||
means_node_params[1] = rs_beta;
|
||||
means_node_params[2] = rs_gamma;
|
||||
means_node_params[3] = tensors[MEANS_INDEX]->t;
|
||||
|
||||
means_node_params[MEANS_EPS_SCL] = vsi_nn_kernel_scalar_create( graph, F32, &eps );
|
||||
means_node_params[MEANS_INPUT_SCALE_SCL] = vsi_nn_kernel_scalar_create( graph, F32, &in_time_out_scale );
|
||||
means_node_params[MEANS_INPUT_ZP_SCL] = vsi_nn_kernel_scalar_create( graph, F32, &input_zp );
|
||||
means_node_params[MEANS_OUTPUT_SCALE_SCL] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale );
|
||||
means_node_params[MEANS_OUTPUT_ZP_SCL] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp );
|
||||
means_node_params[MEANS_INV_MULTIPLIER_SCL] = vsi_nn_kernel_scalar_create( graph, F32, &inv_multiplier );
|
||||
means_node_params[MEANS_GROUP_NUM_SCL] = vsi_nn_kernel_scalar_create( graph, I32, &group_num );
|
||||
|
||||
status = vsi_nn_kernel_node_pass_param( means_node, means_node_params,
|
||||
_INSTANCENORM_MEANS_PARAM_NUM );
|
||||
CHECK_STATUS(status);
|
||||
vsi_nn_kernel_scalar_release( &means_node_params[MEANS_EPS_SCL] );
|
||||
vsi_nn_kernel_scalar_release( &means_node_params[MEANS_INPUT_SCALE_SCL] );
|
||||
vsi_nn_kernel_scalar_release( &means_node_params[MEANS_INPUT_ZP_SCL] );
|
||||
vsi_nn_kernel_scalar_release( &means_node_params[MEANS_OUTPUT_SCALE_SCL] );
|
||||
vsi_nn_kernel_scalar_release( &means_node_params[MEANS_OUTPUT_ZP_SCL] );
|
||||
vsi_nn_kernel_scalar_release( &means_node_params[MEANS_INV_MULTIPLIER_SCL] );
|
||||
vsi_nn_kernel_scalar_release( &means_node_params[MEANS_GROUP_NUM_SCL] );
|
||||
}
|
||||
|
||||
/* dst = x * a + b */
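/* Editor's note: a and b are the per-channel coefficients produced by the means kernel
 * above, so this last pass folds mean subtraction, gamma/beta and output quantization
 * into a single multiply-add per element. */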
|
||||
status = _query_kernel( kernel, inputs, outputs, reshape_flg, INTERNAL_KERNEL_NORMS );
|
||||
if ( VSI_SUCCESS != status )
|
||||
{
|
||||
goto final;
|
||||
}
|
||||
norms_node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if (norms_node)
|
||||
{
|
||||
uint32_t index = 0;
|
||||
node_params[index++] = rs_input;
|
||||
node_params[index++] = tensors[MEANS_INDEX]->t;
|
||||
node_params[index++] = rs_output;
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height );
|
||||
|
||||
status = vsi_nn_kernel_node_pass_param( norms_node, node_params,
|
||||
_INSTANCENORM_PARAM_NUM );
|
||||
CHECK_STATUS(status);
|
||||
vsi_nn_kernel_scalar_release( &node_params[3] );
|
||||
}
|
||||
|
||||
/* Pass parameters to node. */
|
||||
final:
|
||||
vsi_safe_release_tensor(reshape_tensor[0]);
|
||||
vsi_safe_release_tensor(reshape_tensor[1]);
|
||||
|
|
@ -934,8 +1036,10 @@ final:
|
|||
}
|
||||
vsi_safe_release_tensor(tensors[i]);
|
||||
}
|
||||
if (tmp_node) {vsi_nn_kernel_node_release( &tmp_node );}
|
||||
return node;
|
||||
if (sums_node) {vsi_nn_kernel_node_release( &sums_node );}
|
||||
if (means_node) {vsi_nn_kernel_node_release( &means_node );}
|
||||
|
||||
return norms_node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
@ -121,6 +121,7 @@ static const struct {
TENSOR_MATRIX_MUL_TRANSB_KERNELS(F16, U8, U8, KERNEL_SOURCE_4)
|
||||
TENSOR_MATRIX_MUL_TRANSB_KERNELS(U8, U8, F16, KERNEL_SOURCE_5)
|
||||
TENSOR_MATRIX_MUL_TRANSB_KERNELS(U8, U8, U8, KERNEL_SOURCE_5)
|
||||
TENSOR_MATRIX_MUL_TRANSB_KERNELS(I16, I16, I16, KERNEL_SOURCE_13)
|
||||
TENSOR_MATRIX_MUL_TRANSB_KERNELS(BF16,BF16,BF16, KERNEL_SOURCE_15)
|
||||
TENSOR_MATRIX_MUL_TRANSA_KERNELS(U8, U8, U8, KERNEL_SOURCE_7)
|
||||
TENSOR_MATRIX_MUL_TRANSA_KERNELS(I8, I8, I8, KERNEL_SOURCE_7)
|
||||
|
|
@ -622,11 +623,33 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer)
|
|||
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
|
||||
}, GPU_DP_TYPE_16};
|
||||
|
||||
gpu_dp_inst_t uniI16MulI16SumtoI32_16x1 = {{
|
||||
0xaaaa5555, // TCfg
|
||||
0x00000000, // ASelt
|
||||
0x76543210, 0x76543210, // ABin
|
||||
0xaaaa5555, // BSelt
|
||||
0x76543210, 0x00000000, // BBin
|
||||
0x00000300, // AccumType, ConstantType, and PostShift
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00020001, 0x00040003, 0x00060005, 0x00080007 // Constant
|
||||
}, GPU_DP_TYPE_16};
|
||||
gpu_dp_inst_t uniI16MulI16SumtoI32B_16x1 = {{
|
||||
0x0002aaab, // TCfg
|
||||
0x00015554, // ASelt
|
||||
0x65432100, 0x00000007, // ABin
|
||||
0x0002aaa8, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00002300, // AccumType, ConstantType, and PostShift
|
||||
0x00010000, 0x00030002, 0x00050004, 0x00070006,
|
||||
0x00000008, 0x00000000, 0x00000000, 0x00000000 // Constant
|
||||
}, GPU_DP_TYPE_16};
|
||||
|
||||
float scaleIn0divOut = src0Scale / dstScale;
|
||||
float scaleIn1divOut = src1Scale / dstScale;
|
||||
float inScaleMul = src0Scale * src1Scale;
|
||||
float reScaleOut = 1 / dstScale;
|
||||
float inScaledivOut = inScaleMul / dstScale;
|
||||
float inout_beta = src0ZP * src1ZP * 8 * inScaledivOut + dstZP;
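/* Editor's note: the factor of 8 presumably matches the 8 products accumulated per DP
 * instruction; expanding sum((a_i - zpA) * (b_i - zpB)) over 8 lanes leaves a constant
 * 8 * zpA * zpB term, which is pre-scaled here and combined with the output zero point. */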
|
||||
uint32_t multiplierA = (M0 << 16) | M0;
|
||||
uint32_t multiplierB = (M1 << 16) | M1;
|
||||
uint32_t multiplierZpA = (src0ZP << 16) | src0ZP;
|
||||
|
|
@ -647,6 +670,14 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer)
|
|||
uniGemmFp16U8MulZptoFp32_4x4.data[i] = multiplierZpB;
|
||||
uniGemmFp16I16MulZptoFp32_4x4.data[i] = multiplierZpB;
|
||||
}
|
||||
for( i = 8; i < 12; i++)
|
||||
{
|
||||
uniI16MulI16SumtoI32B_16x1.data[i] = multiplierZpA;
|
||||
}
|
||||
for( i = 12; i < 16; i++)
|
||||
{
|
||||
uniI16MulI16SumtoI32_16x1.data[i] = multiplierZpB;
|
||||
}
|
||||
|
||||
switch( pack_key )
|
||||
{
|
||||
|
|
@ -746,6 +777,8 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer)
|
|||
break;
|
||||
case _PACK_SELECT_KEY( I16, I16, I16, 0, 0, 0 ):
|
||||
case _PACK_SELECT_KEY( I16, I16, I16, 0, 0, 1 ):
|
||||
case _PACK_SELECT_KEY( I16, I16, I16, 0, 1, 0 ):
|
||||
case _PACK_SELECT_KEY( I16, I16, I16, 0, 1, 1 ):
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param( node,
|
||||
"uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 );
|
||||
|
|
@ -753,10 +786,16 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer)
|
|||
"uniConvertUint8SubZpToFp32_4x4", &uniConvertUint8SubZpToFp32_4x4 );
|
||||
status |= vsi_nn_kernel_gpu_add_param( node,
|
||||
"uniConvertUint8SubZpToFp32B_4x4", &uniConvertUint8SubZpToFp32B_4x4 );
|
||||
status |= vsi_nn_kernel_gpu_add_param( node,
|
||||
"uniI16MulI16SumtoI32_16x1", &uniI16MulI16SumtoI32_16x1 );
|
||||
status |= vsi_nn_kernel_gpu_add_param( node,
|
||||
"uniI16MulI16SumtoI32B_16x1", &uniI16MulI16SumtoI32B_16x1 );
|
||||
status |= vsi_nn_kernel_gpu_add_param( node, "input0_ZP", &src0ZP );
|
||||
status |= vsi_nn_kernel_gpu_add_param( node, "input1_ZP", &src1ZP );
|
||||
status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &dstZP );
|
||||
status |= vsi_nn_kernel_gpu_add_param( node, "outputScale", &reScaleOut );
|
||||
status |= vsi_nn_kernel_gpu_add_param( node, "inout_scale", &inScaledivOut );
|
||||
status |= vsi_nn_kernel_gpu_add_param( node, "inout_beta", &inout_beta );
|
||||
}
|
||||
break;
|
||||
case _PACK_SELECT_KEY( F16, U8, F16, 0, 0, 0 ):
|
@ -43,14 +43,18 @@ __BEGIN_DECLS
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toU8")
|
||||
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toI8")
|
||||
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_COPY_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_nv12_copy_U8toU8")
|
||||
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_COPY_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_nv12_copy_U8toI8")
|
||||
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_COPY_U8TOI16 CVIVANTE_NAMESPACE("evis.pre_process_nv12_copy_U8toI16")
|
||||
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_COPY_U8TOF16 CVIVANTE_NAMESPACE("evis.pre_process_nv12_copy_U8toF16")
|
||||
|
||||
// greater than a quarter
|
||||
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOU8_GQ CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toU8_gq")
|
||||
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOI8_GQ CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toI8_gq")
|
||||
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOF16_GQ CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toF16_gq")
|
||||
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOI16_GQ CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toI16_gq")
|
||||
|
||||
#define KERNEL_SOURCE_1 "pre_process_nv12_scale_8bits",
|
||||
#define KERNEL_SOURCE_1 "pre_process_nv12_copy",
|
||||
#define KERNEL_SOURCE_2 "pre_process_nv12_scale",
|
||||
#define KERNEL_SOURCE_4 "pre_process_nv12_scale_mix"
|
||||
|
||||
typedef enum
|
||||
{
|
||||
|
|
@ -78,13 +82,18 @@ static const struct {
|
|||
const char* source_name;
|
||||
} pre_process_nv12_map[] =
|
||||
{
|
||||
TENSOR_PRE_PROCESS_NV12_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1)
|
||||
TENSOR_PRE_PROCESS_NV12_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_1)
|
||||
TENSOR_PRE_PROCESS_NV12_KERNELS(U8, U8, COPY, KERNEL_SOURCE_1)
|
||||
TENSOR_PRE_PROCESS_NV12_KERNELS(U8, I8, COPY, KERNEL_SOURCE_1)
|
||||
TENSOR_PRE_PROCESS_NV12_KERNELS(U8, I16, COPY, KERNEL_SOURCE_1)
|
||||
TENSOR_PRE_PROCESS_NV12_KERNELS(U8, F16, COPY, KERNEL_SOURCE_1)
|
||||
TENSOR_PRE_PROCESS_NV12_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_2)
|
||||
TENSOR_PRE_PROCESS_NV12_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_2)
|
||||
TENSOR_PRE_PROCESS_NV12_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_2)
|
||||
TENSOR_PRE_PROCESS_NV12_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_2)
|
||||
TENSOR_PRE_PROCESS_NV12_GQ_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_4)
|
||||
TENSOR_PRE_PROCESS_NV12_GQ_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_4)
|
||||
TENSOR_PRE_PROCESS_NV12_GQ_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_2)
|
||||
TENSOR_PRE_PROCESS_NV12_GQ_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_2)
|
||||
TENSOR_PRE_PROCESS_NV12_GQ_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_2)
|
||||
TENSOR_PRE_PROCESS_NV12_GQ_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_2)
|
||||
};
|
||||
|
||||
static vx_param_description_t vxPreProcessNv12Kernel_param_def[] =
|
||||
|
|
@ -120,8 +129,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer)
|
|||
{0, 0, 0}, // localWorkSize: local group size in thread
|
||||
{0, 0, 0}}; // globalWorkSize: image size in thread
|
||||
|
||||
int32_t dstZP = 0;
|
||||
float dstScale = 1;
|
||||
float output_zp = 0;
|
||||
float output_scale = 1;
|
||||
int32_t reorder = 0;
|
||||
int32_t order1 = 2;
|
||||
uint32_t width = 0;
|
||||
|
|
@ -148,6 +157,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer)
    CHECK_STATUS_FAIL_GOTO(status, OnError );

    out_shape = attr[0]->shape;
    output_scale = 1.0f / attr[0]->scale;
    output_zp = (float)attr[0]->zero_point;
    width = (uint32_t)(out_shape->data[0]);
    height = (uint32_t)(out_shape->data[1]);

@ -157,33 +168,10 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer)
        order1 = 0;
    }

    if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
    {
        dstScale = 1.0f / attr[0]->asymm.scale;
        dstZP = attr[0]->asymm.zero_point;
    }
    else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
    {
        if (attr[0]->dfp.fl > 0)
        {
            dstScale = (vx_float32)((int64_t)1 << attr[0]->dfp.fl);
        }
        else
        {
            dstScale = (1.0f / (vx_float32)((int64_t)1 << -attr[0]->dfp.fl));
        }
        dstZP = 0;
    }
    else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE)
    {
        dstScale = 1;
        dstZP = 0;
    }

    outputScaleVar = dstScale * var;
    bMeanScaleVarZp = (float)dstZP - bMean * outputScaleVar;
    gMeanScaleVarZp = (float)dstZP - gMean * outputScaleVar;
    rMeanScaleVarZp = (float)dstZP - rMean * outputScaleVar;
    outputScaleVar = output_scale * var;
    bMeanScaleVarZp = output_zp - bMean * outputScaleVar;
    gMeanScaleVarZp = output_zp - gMean * outputScaleVar;
    rMeanScaleVarZp = output_zp - rMean * outputScaleVar;

    shaderParam.global_scale[0] = 4;
    shaderParam.global_scale[1] = 1;

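Note (not part of the diff): the rewritten constants above fold the mean/variance normalization and the output quantization into two shader-side values, using the unified attr->scale / attr->zero_point fields instead of the removed per-quantization branches. A minimal sketch of the algebra, with the hypothetical helper name fold_channel; all other names come from the hunk above:

/* Sketch only: why outputScaleVar and the per-channel *MeanScaleVarZp
 * constants are equivalent to the longer normalize-then-quantize form.
 *   out = ((rgb - mean) * var) * output_scale + output_zp
 *       =  rgb * (output_scale * var) + (output_zp - mean * output_scale * var)
 *       =  rgb * outputScaleVar + meanScaleVarZp
 */
static float fold_channel(float rgb, float mean, float var,
                          float output_scale, float output_zp)
{
    float outputScaleVar = output_scale * var;
    float meanScaleVarZp = output_zp - mean * outputScaleVar;
    return rgb * outputScaleVar + meanScaleVarZp;
}
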
@ -249,18 +237,46 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer)
|
|||
0x00010001, 0x00010001, 0x00010001, 0x00010001,
|
||||
0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
|
||||
}, GPU_DP_TYPE_16 };
|
||||
gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{
|
||||
0x11111111, // TCfg
|
||||
0x11110000, // ASelt
|
||||
0x06040200, 0x06040200, // ABin
|
||||
0x22222222, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00000600, // AccumType, ConstantType, and PostShift
|
||||
0x00000001, 0x00000001, 0x00000001, 0x00000001,
|
||||
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
|
||||
}, GPU_DP_TYPE_16 };
|
||||
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toB_4x4", &uniConvertNV12toB_4x4);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toG_4x4", &uniConvertNV12toG_4x4);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toR_4x4", &uniConvertNV12toR_4x4);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractUVtoCharSub128_2x8", &uniExtractUVtoCharSub128_2x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar", &outputScaleVar);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp);
|
||||
switch( attr[0]->dtype )
|
||||
{
|
||||
case U8:
|
||||
case I8:
|
||||
case I16:
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertInt32toUint8_2x8);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
case F16:
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertHalftoFp16_2x8);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
|
||||
|
|
@ -288,8 +304,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer)
|
|||
{0, 0, 0}, // localWorkSize: local group size in thread
|
||||
{0, 0, 0}}; // globalWorkSize: image size in thread
|
||||
|
||||
int32_t dstZP = 0;
|
||||
float dstScale = 1;
|
||||
float output_zp = 0;
|
||||
float output_scale = 1;
|
||||
int32_t reorder = 0;
|
||||
int32_t order1 = 2;
|
||||
uint32_t width = 0;
|
||||
|
|
@ -330,8 +346,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer)
|
|||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
|
||||
out_shape = attr[1]->shape;
|
||||
dstZP = attr[1]->asymm.zero_point;
|
||||
dstScale = attr[1]->asymm.scale;
|
||||
output_scale = 1.0f / attr[1]->scale;
|
||||
output_zp = (float)attr[1]->zero_point;
|
||||
width = (uint32_t)(out_shape->data[0]);
|
||||
height = (uint32_t)(out_shape->data[1]);
|
||||
|
||||
|
|
@ -347,32 +363,10 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer)
|
|||
xrIntFloat_16 = (uint32_t)((roi_width << 16) / width + 1);
|
||||
yrIntFloat_16 = (uint32_t)((roi_height << 16) / height + 1);
|
||||
|
||||
if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
|
||||
{
|
||||
dstScale = 1.0f / dstScale;
|
||||
}
|
||||
else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP)
|
||||
{
|
||||
if (attr[1]->dfp.fl > 0)
|
||||
{
|
||||
dstScale = (vx_float32)((int64_t)1 << attr[1]->dfp.fl);
|
||||
}
|
||||
else
|
||||
{
|
||||
dstScale = (1.0f / (vx_float32)((int64_t)1 << -attr[1]->dfp.fl));
|
||||
}
|
||||
dstZP = 0;
|
||||
}
|
||||
else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE)
|
||||
{
|
||||
dstScale = 1;
|
||||
dstZP = 0;
|
||||
}
|
||||
|
||||
outputScaleVar = dstScale * var;
|
||||
bMeanScaleVarZp = (float)dstZP - bMean * outputScaleVar;
|
||||
gMeanScaleVarZp = (float)dstZP - gMean * outputScaleVar;
|
||||
rMeanScaleVarZp = (float)dstZP - rMean * outputScaleVar;
|
||||
outputScaleVar = output_scale * var;
|
||||
bMeanScaleVarZp = output_zp - bMean * outputScaleVar;
|
||||
gMeanScaleVarZp = output_zp - gMean * outputScaleVar;
|
||||
rMeanScaleVarZp = output_zp - rMean * outputScaleVar;
|
||||
|
||||
shaderParam.global_scale[0] = 4;
|
||||
shaderParam.global_scale[1] = 1;
|
||||
|
|
@ -482,7 +476,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer)
    status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp);
    status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp);

    if (resize >= 0.25 && (attr[1]->dtype == U8 || attr[1]->dtype == F16))
    if (resize >= 0.25)
    {
        status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateYShift_2x8", &uniCalculateYShift_2x8);
        status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateUVShift_2x8", &uniCalculateUVShift_2x8);

@ -499,13 +493,13 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer)
        case I8:
        case I16:
            {
                status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8);
                status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertInt32toUint8_2x8);
                CHECK_STATUS_FAIL_GOTO(status, OnError );
            }
            break;
        case F16:
            {
                status = vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8", &uniConvertHalftoFp16_2x8);
                status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertHalftoFp16_2x8);
                CHECK_STATUS_FAIL_GOTO(status, OnError );
            }
            break;
@ -551,7 +545,7 @@ static vsi_status _query_kernel
    input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    if (enable_copy && output_dtype == U8)
    if (enable_copy)
    {
        convert_type = COPY;
    }

@ -560,16 +554,16 @@ static vsi_status _query_kernel
        convert_type = SCALE;
    }

    if (scaleVal >= 0.25 && (output_dtype == U8 || output_dtype == F16) && convert_type == SCALE)
    if (scaleVal >= 0.25 && convert_type == SCALE)
    {
        optFlg = 1;
    }

    key = HASH_PRE_PROCESS_NV12_KEY( input0_dtype, output_dtype, convert_type, optFlg );

    for( i = 0; i < _cnt_of_array(pre_process_nv12_map); i ++ )
    for ( i = 0; i < _cnt_of_array(pre_process_nv12_map); i ++ )
    {
        if( pre_process_nv12_map[i].key == key )
        if ( pre_process_nv12_map[i].key == key )
        {
            break;
        }
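Note (not part of the diff): the lookup above follows the selection pattern shared by all the kernel map tables in this commit: pack the dtypes and convert type into an integer key, then linearly scan the table. A minimal sketch with hypothetical names kernel_map_entry_t and find_kernel_entry; the exact NV12 key layout, which also folds in optFlg, is defined by HASH_PRE_PROCESS_NV12_KEY elsewhere in this file, and the YUV422 variant later in the diff packs (in << 24) | (out << 16) | (convert << 8):

typedef struct
{
    uint32_t    key;
    char*       function_name;
    const char* source_name;
} kernel_map_entry_t; /* mirrors the anonymous struct used by the map tables */

static int32_t find_kernel_entry( const kernel_map_entry_t* map, size_t count, uint32_t key )
{
    size_t i = 0;
    for ( i = 0; i < count; i ++ )
    {
        if ( map[i].key == key )
        {
            return (int32_t)i; /* the callers in this file instead keep i and test i < _cnt_of_array(map) */
        }
    }
    return -1;
}
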
@ -580,7 +574,7 @@ static vsi_status _query_kernel
        kernel->info.parameters = vxPreProcessNv12Kernel_param_def;
        kernel->info.numParams = _cnt_of_array( vxPreProcessNv12Kernel_param_def );

        if(convert_type == COPY)
        if (convert_type == COPY)
        {
            kernel->info.initialize = _pre_process_nv12_copy_initializer;
        }

@ -666,10 +660,8 @@ static vsi_nn_kernel_node_t _setup
            vsi_nn_kernel_scalar_release( &tmp_params[12] );
        }
    }
    if(reshape_tensors[0])
    {
        vsi_nn_ReleaseTensor(&reshape_tensors[0]);
    }
    vsi_safe_release_tensor(reshape_tensors[0]);

    return node;
} /* _setup() */

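Note (not part of the diff): vsi_safe_release_tensor() replaces the manual NULL-check/release pair removed above. Its real definition lives elsewhere in the tree; the sketch below, with the hypothetical name vsi_safe_release_tensor_sketch, only illustrates the assumed intent (release if non-NULL, then clear the pointer):

#define vsi_safe_release_tensor_sketch( _t ) \
    do { if (_t) { vsi_nn_ReleaseTensor( &(_t) ); (_t) = NULL; } } while (0)
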
@ -43,13 +43,13 @@ __BEGIN_DECLS
#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_scale_U8toU8")
#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_SCALE_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_scale_U8toI8")
#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_COPY_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_copy_U8toU8")
#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_COPY_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_copy_U8toI8")
#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_COPY_U8TOI16 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_copy_U8toI16")
#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_COPY_U8TOF16 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_copy_U8toF16")

#define KERNEL_SOURCE_1 "pre_process_yuv420_scale_u8",
#define KERNEL_SOURCE_2 "pre_process_yuv420_copy_u8",
#define KERNEL_SOURCE_3 "pre_process_yuv420_scale_fp16",
#define KERNEL_SOURCE_4 "pre_process_yuv420_scale_i16",
#define KERNEL_SOURCE_5 "pre_process_yuv420_scale_i8",
#define KERNEL_SOURCE_0 "pre_process_yuv420_copy",
#define KERNEL_SOURCE_1 "pre_process_yuv420_scale_0",
#define KERNEL_SOURCE_2 "pre_process_yuv420_scale_1",

typedef enum
{
@ -73,12 +73,14 @@ static const struct {
    const char* source_name;
} pre_process_yuv420_map[] =
{
    TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_3)
    TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_4)
    TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_2)
    TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_2)
    TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1)
    TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_5)
    TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, COPY, KERNEL_SOURCE_2)
    TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, F16, COPY, KERNEL_SOURCE_2)
    TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_1)
    TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, COPY, KERNEL_SOURCE_0)
    TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, F16, COPY, KERNEL_SOURCE_0)
    TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, I8, COPY, KERNEL_SOURCE_0)
    TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, I16, COPY, KERNEL_SOURCE_0)
};

static vx_param_description_t vxPreProcessYuv420Kernel_param_def[] =
@ -115,13 +117,13 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer)
|
|||
{0, 0, 0}, // localWorkSize: local group size in thread
|
||||
{0, 0, 0}}; // globalWorkSize: image size in thread
|
||||
|
||||
int32_t dstZP = 0;
|
||||
float dstScale = 1;
|
||||
int32_t reorder = 0;
|
||||
int32_t trans = 0;
|
||||
int32_t order1 = 2;
|
||||
uint32_t width = 0;
|
||||
uint32_t height = 0;
|
||||
float output_zp = 0;
|
||||
float output_scale = 1;
|
||||
int32_t reorder = 0;
|
||||
int32_t trans = 0;
|
||||
int32_t order1 = 2;
|
||||
uint32_t width = 0;
|
||||
uint32_t height = 0;
|
||||
|
||||
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
|
||||
vsi_size_array_t * out_shape = NULL;
|
||||
|
|
@ -149,23 +151,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer)
|
|||
width = width / 3;
|
||||
}
|
||||
|
||||
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
|
||||
{
|
||||
dstScale = 1.0f / attr[0]->asymm.scale;
|
||||
dstZP = attr[0]->asymm.zero_point;
|
||||
}
|
||||
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
|
||||
{
|
||||
if (attr[0]->dfp.fl > 0)
|
||||
{
|
||||
dstScale = (float)((int64_t)1 << attr[0]->dfp.fl);
|
||||
}
|
||||
else
|
||||
{
|
||||
dstScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl));
|
||||
}
|
||||
dstZP = 0;
|
||||
}
|
||||
output_scale = 1.0f / attr[0]->scale;
|
||||
output_zp = (float)attr[0]->zero_point;
|
||||
|
||||
shaderParam.global_scale[0] = 16;
|
||||
shaderParam.global_scale[1] = 1;
|
||||
|
|
@ -426,8 +413,10 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer)
|
|||
}, GPU_DP_TYPE_16 };
|
||||
switch( attr[0]->dtype )
|
||||
{
|
||||
case I8:
|
||||
case U8:
|
||||
case F16:
|
||||
case I16:
|
||||
{
|
||||
// R
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpR1st_4x4", &uniCalculateTmpR1st_4x4);
|
||||
|
|
@ -461,8 +450,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer)
|
|||
status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8LoR_2x8", &uniQuantU8toU8LoR_2x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8HiR_2x8", &uniQuantU8toU8HiR_2x8);
|
||||
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "zp", &dstZP);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &dstScale);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
|
|
@ -497,8 +486,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer)
|
|||
{0, 0, 0}, // localWorkSize: local group size in thread
|
||||
{0, 0, 0}}; // globalWorkSize: image size in thread
|
||||
|
||||
int32_t dstZP = 0;
|
||||
float dstScale = 1;
|
||||
float output_zp = 0;
|
||||
float output_scale = 1;
|
||||
int32_t reorder = 0;
|
||||
int32_t order1 = 2;
|
||||
uint32_t width = 0;
|
||||
|
|
@ -513,11 +502,11 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer)
|
|||
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &reorder);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
|
||||
out_shape = attr[0]->shape;
|
||||
dstZP = attr[0]->asymm.zero_point;
|
||||
dstScale = attr[0]->asymm.scale;
|
||||
width = (uint32_t)(out_shape->data[0]);
|
||||
height = (uint32_t)(out_shape->data[1]);
|
||||
out_shape = attr[0]->shape;
|
||||
output_zp = (float)attr[0]->zero_point;
|
||||
output_scale = 1.0f / attr[0]->scale;
|
||||
width = (uint32_t)(out_shape->data[0]);
|
||||
height = (uint32_t)(out_shape->data[1]);
|
||||
|
||||
if (reorder != 0)
|
||||
{
|
||||
|
|
@ -525,28 +514,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer)
|
|||
order1 = 0;
|
||||
}
|
||||
|
||||
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
|
||||
{
|
||||
if (attr[0]->dfp.fl > 0)
|
||||
{
|
||||
dstScale = (vx_float32)((int64_t)1 << attr[0]->dfp.fl);
|
||||
}
|
||||
else
|
||||
{
|
||||
dstScale = (1.0f / (vx_float32)((int64_t)1 << -attr[0]->dfp.fl));
|
||||
}
|
||||
dstZP = 0;
|
||||
}
|
||||
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
|
||||
{
|
||||
dstScale = 1.0f / dstScale;
|
||||
}
|
||||
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
|
||||
{
|
||||
dstScale = 1;
|
||||
dstZP = 0;
|
||||
}
|
||||
|
||||
shaderParam.global_scale[0] = 4;
|
||||
shaderParam.global_scale[1] = 1;
|
||||
shaderParam.global_scale[2] = 1;
|
||||
|
|
@ -822,24 +789,20 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer)
|
|||
switch( attr[0]->dtype )
|
||||
{
|
||||
case U8:
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &dstScale);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "zp", &dstZP);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
case F16:
|
||||
case I8:
|
||||
case I16:
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &dstScale);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
case F16:
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8", &uniConvertHalftoFp16_2x8);
|
||||
if (attr[0]->dtype == F16)
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertHalftoFp16_2x8);
|
||||
}
|
||||
else
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertInt32toUint8_2x8);
|
||||
}
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
|
|
@ -876,12 +839,14 @@ static vsi_status _query_kernel
    input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    if (enable_copy && (output_dtype == U8 || output_dtype == F16))
    if (enable_copy && (output_dtype == I8 || output_dtype == U8 || output_dtype == F16 || output_dtype == I16))
    {
        convert_type = COPY;
        enable_copy = TRUE;
    }
    else
    {
        enable_copy = FALSE;
        convert_type = SCALE;
    }

@ -900,7 +865,7 @@ static vsi_status _query_kernel
        kernel->info.parameters = vxPreProcessYuv420Kernel_param_def;
        kernel->info.numParams = _cnt_of_array( vxPreProcessYuv420Kernel_param_def );

        if (enable_copy && (output_dtype == U8 || output_dtype == F16))
        if (enable_copy)
        {
            kernel->info.initialize = _pre_process_yuv420_copy_initializer;
        }

|
|||
|
|
@ -0,0 +1,623 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "kernel/vsi_nn_kernel_eltwise.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
#define VX_KERNEL_NAME_PRE_PROCESS_YUV422_SCALE_U8TOF16 CVIVANTE_NAMESPACE("evis.pre_process_yuv422_scale_U8toF16")
|
||||
#define VX_KERNEL_NAME_PRE_PROCESS_YUV422_SCALE_U8TOI16 CVIVANTE_NAMESPACE("evis.pre_process_yuv422_scale_U8toI16")
|
||||
#define VX_KERNEL_NAME_PRE_PROCESS_YUV422_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv422_scale_U8toU8")
|
||||
#define VX_KERNEL_NAME_PRE_PROCESS_YUV422_SCALE_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_yuv422_scale_U8toI8")
|
||||
#define VX_KERNEL_NAME_PRE_PROCESS_YUV422_COPY_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv422_copy_U8toU8")
|
||||
#define VX_KERNEL_NAME_PRE_PROCESS_YUV422_COPY_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_yuv422_copy_U8toI8")
|
||||
#define VX_KERNEL_NAME_PRE_PROCESS_YUV422_COPY_U8TOI16 CVIVANTE_NAMESPACE("evis.pre_process_yuv422_copy_U8toI16")
|
||||
#define VX_KERNEL_NAME_PRE_PROCESS_YUV422_COPY_U8TOF16 CVIVANTE_NAMESPACE("evis.pre_process_yuv422_copy_U8toF16")
|
||||
|
||||
#define KERNEL_SOURCE_1 "pre_process_yuv422_copy",
|
||||
#define KERNEL_SOURCE_2 "pre_process_yuv422_scale",
|
||||
|
||||
typedef enum
|
||||
{
|
||||
COPY = 0,
|
||||
SCALE,
|
||||
TRANS
|
||||
} vsi_nn_kernel_convert_type_e;
|
||||
|
||||
|
||||
// Add kernel hashtable here
|
||||
#define HASH_PRE_PROCESS_YUV422_KEY(_input0_type, _output_type, _convert_type) \
|
||||
((_input0_type << 24) | (_output_type << 16) | (_convert_type << 8))
|
||||
|
||||
#define TENSOR_PRE_PROCESS_YUV422_KERNELS(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, SOURCE) \
|
||||
{ HASH_PRE_PROCESS_YUV422_KEY(IN0_TYPE, OUT_TYPE, CONVERT_TYPE), \
|
||||
VX_KERNEL_NAME_PRE_PROCESS_YUV422_##CONVERT_TYPE##_##IN0_TYPE##TO##OUT_TYPE, \
|
||||
SOURCE },
|
||||
|
||||
static const struct {
|
||||
uint32_t key;
|
||||
char* function_name;
|
||||
const char* source_name;
|
||||
} pre_process_yuv422_map[] =
|
||||
{
|
||||
TENSOR_PRE_PROCESS_YUV422_KERNELS(U8, U8, COPY, KERNEL_SOURCE_1)
|
||||
TENSOR_PRE_PROCESS_YUV422_KERNELS(U8, I8, COPY, KERNEL_SOURCE_1)
|
||||
TENSOR_PRE_PROCESS_YUV422_KERNELS(U8, I16, COPY, KERNEL_SOURCE_1)
|
||||
TENSOR_PRE_PROCESS_YUV422_KERNELS(U8, F16, COPY, KERNEL_SOURCE_1)
|
||||
TENSOR_PRE_PROCESS_YUV422_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_2)
|
||||
TENSOR_PRE_PROCESS_YUV422_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_2)
|
||||
TENSOR_PRE_PROCESS_YUV422_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_2)
|
||||
TENSOR_PRE_PROCESS_YUV422_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_2)
|
||||
};
|
||||
|
||||
static vx_param_description_t vxPreProcessyuv422Kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
};
|
||||
#define _EVIS_PRE_PROCESS_YUV422_PARAM_NUM _cnt_of_array(vxPreProcessyuv422Kernel_param_def)
|
||||
|
||||
/*
|
||||
* Kernel initializer
|
||||
*/
|
||||
DEF_KERNEL_INITIALIZER(_pre_process_yuv422_copy_initializer)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
gpu_param_t shaderParam = {
|
||||
3, // workdim
|
||||
{0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
|
||||
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
|
||||
{0, 0, 0}, // localWorkSize: local group size in thread
|
||||
{0, 0, 0}}; // globalWorkSize: image size in thread
|
||||
|
||||
float output_zp = 0;
|
||||
float output_scale = 1;
|
||||
int32_t reorder = 0;
|
||||
int32_t order1 = 2;
|
||||
uint32_t width = 0;
|
||||
uint32_t height = 0;
|
||||
float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f, var = 0.0f;
|
||||
float outputScaleVar = 0.0f;
|
||||
float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f;
|
||||
|
||||
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
|
||||
vsi_size_array_t * out_shape = NULL;
|
||||
|
||||
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
|
||||
|
||||
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[6], &rMean);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[7], &gMean);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &bMean);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &var);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &reorder);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
|
||||
out_shape = attr[0]->shape;
|
||||
output_scale = 1.0f / attr[0]->scale;
|
||||
output_zp = (float)attr[0]->zero_point;
|
||||
width = (uint32_t)(out_shape->data[0]);
|
||||
height = (uint32_t)(out_shape->data[1]);
|
||||
|
||||
if (reorder != 0)
|
||||
{
|
||||
reorder = 2;
|
||||
order1 = 0;
|
||||
}
|
||||
|
||||
outputScaleVar = output_scale * var;
|
||||
bMeanScaleVarZp = output_zp - bMean * outputScaleVar;
|
||||
gMeanScaleVarZp = output_zp - gMean * outputScaleVar;
|
||||
rMeanScaleVarZp = output_zp - rMean * outputScaleVar;
|
||||
|
||||
shaderParam.global_scale[0] = 4;
|
||||
shaderParam.global_scale[1] = 1;
|
||||
shaderParam.global_scale[2] = 1;
|
||||
shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1)
|
||||
/ shaderParam.global_scale[0], 4);
|
||||
shaderParam.global_size[1] = gpu_align_p2((height + shaderParam.global_scale[1] - 1)
|
||||
/ shaderParam.global_scale[1], 1);
|
||||
shaderParam.global_size[2] = 1;
|
||||
|
||||
status = vsi_nn_kernel_gpu_config( node, &shaderParam );
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError);
|
||||
|
||||
{
|
||||
gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
|
||||
0x00003333, // TCfg
|
||||
0x00000000, // ASelt
|
||||
0x03020100, 0x00000000, // ABin
|
||||
0x00000000, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00002400, // AccumType, ConstantType, and PostShift
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
|
||||
}, GPU_DP_TYPE_16 };
|
||||
|
||||
gpu_dp_inst_t uniConvertYUV422toB_4x4 = {{
|
||||
0x05050505, // TCfg
|
||||
0x00000000, // ASelt
|
||||
0x00120010, 0x00560054, // ABin
|
||||
0x0a0a0a0a, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00000100, // AccumType, ConstantType, and PostShift
|
||||
0x3f1d3c00, 0x00000000, 0x3f1d3c00, 0x00000000,
|
||||
0x3f1d3c00, 0x00000000, 0x3f1d3c00, 0x00000000 // Constant
|
||||
}, GPU_DP_TYPE_16 };
|
||||
gpu_dp_inst_t uniConvertYUV422toG_4x4 = {{
|
||||
0x29292929, // TCfg
|
||||
0x00000000, // ASelt
|
||||
0x03120310, 0x07560754, // ABin
|
||||
0x2a2a2a2a, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00000100, // AccumType, ConstantType, and PostShift
|
||||
0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc,
|
||||
0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc // Constant
|
||||
}, GPU_DP_TYPE_16 };
|
||||
gpu_dp_inst_t uniConvertYUV422toR_4x4 = {{
|
||||
0x05050505, // TCfg
|
||||
0x00000000, // ASelt
|
||||
0x00320030, 0x00760074, // ABin
|
||||
0x0a0a0a0a, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00000100, // AccumType, ConstantType, and PostShift
|
||||
0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000,
|
||||
0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000 // Constant
|
||||
}, GPU_DP_TYPE_16 };
|
||||
gpu_dp_inst_t uniExtractUVtoCharSub128_2x8 = {{
|
||||
0x91919191, // TCfg
|
||||
0x40404040, // ASelt
|
||||
0x03020100, 0x07060504, // ABin
|
||||
0xa2a2a2a2, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00000700, // AccumType, ConstantType, and PostShift
|
||||
0x00000001, 0x00010001, 0x00000001, 0x00010001,
|
||||
0x00000001, 0x00010001, 0x00000001, 0x00010001 // Constant
|
||||
}, GPU_DP_TYPE_16 };
|
||||
gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{
|
||||
0x11111111, // TCfg
|
||||
0x11110000, // ASelt
|
||||
0x06040200, 0x06040200, // ABin
|
||||
0x22222222, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00000600, // AccumType, ConstantType, and PostShift
|
||||
0x00000001, 0x00000001, 0x00000001, 0x00000001,
|
||||
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
|
||||
}, GPU_DP_TYPE_16 };
|
||||
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toB_4x4", &uniConvertYUV422toB_4x4);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toG_4x4", &uniConvertYUV422toG_4x4);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toR_4x4", &uniConvertYUV422toR_4x4);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractUVtoCharSub128_2x8", &uniExtractUVtoCharSub128_2x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar", &outputScaleVar);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp);
|
||||
switch( attr[0]->dtype )
|
||||
{
|
||||
case U8:
|
||||
case I8:
|
||||
case I16:
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertInt32toUint8_2x8);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
case F16:
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertHalftoFp16_2x8);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
|
||||
OnError:
|
||||
if (attr[0])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &attr[0] );
|
||||
attr[0] = NULL;
|
||||
}
|
||||
return status;
|
||||
} /* _pre_process_yuv422_copy_initializer() */
|
||||
|
||||
|
||||
DEF_KERNEL_INITIALIZER(_pre_process_yuv422_initializer)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
gpu_param_t shaderParam = {
|
||||
3, // workdim
|
||||
{0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
|
||||
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
|
||||
{0, 0, 0}, // localWorkSize: local group size in thread
|
||||
{0, 0, 0}}; // globalWorkSize: image size in thread
|
||||
|
||||
float output_zp = 0;
|
||||
float output_scale = 1;
|
||||
int32_t reorder = 0;
|
||||
int32_t order1 = 2;
|
||||
uint32_t width = 0;
|
||||
uint32_t height = 0;
|
||||
uint32_t roi_width = 0;
|
||||
uint32_t roi_height = 0;
|
||||
uint32_t xrIntFloat_16 = 0;
|
||||
uint32_t yrIntFloat_16 = 0;
|
||||
int32_t xRatio = 0;
|
||||
int32_t yRatio = 0;
|
||||
float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f, var = 0.0f;
|
||||
float outputScaleVar = 0.0f;
|
||||
float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f;
|
||||
|
||||
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
|
||||
vsi_size_array_t * out_shape = NULL;
|
||||
|
||||
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
|
||||
|
||||
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &xRatio);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &yRatio);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[6], &rMean);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[7], &gMean);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &bMean);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &var);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &reorder);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
|
||||
out_shape = attr[0]->shape;
|
||||
output_scale = 1.0f / attr[0]->scale;
|
||||
output_zp = (float)attr[0]->zero_point;
|
||||
width = (uint32_t)(out_shape->data[0]);
|
||||
height = (uint32_t)(out_shape->data[1]);
|
||||
|
||||
if (reorder != 0)
|
||||
{
|
||||
reorder = 2;
|
||||
order1 = 0;
|
||||
}
|
||||
|
||||
roi_width = (xRatio * width) >> 15;
|
||||
roi_height = (yRatio * height) >> 15;
|
||||
xrIntFloat_16 = (uint32_t)((roi_width << 16) / width + 1);
|
||||
yrIntFloat_16 = (uint32_t)((roi_height << 16) / height + 1);
|
||||
|
||||
outputScaleVar = output_scale * var;
|
||||
bMeanScaleVarZp = output_zp - bMean * outputScaleVar;
|
||||
gMeanScaleVarZp = output_zp - gMean * outputScaleVar;
|
||||
rMeanScaleVarZp = output_zp - rMean * outputScaleVar;
|
||||
|
||||
shaderParam.global_scale[0] = 4;
|
||||
shaderParam.global_scale[1] = 1;
|
||||
shaderParam.global_scale[2] = 1;
|
||||
shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1)
|
||||
/ shaderParam.global_scale[0], 4);
|
||||
shaderParam.global_size[1] = gpu_align_p2((height + shaderParam.global_scale[1] - 1)
|
||||
/ shaderParam.global_scale[1], 1);
|
||||
shaderParam.global_size[2] = 1;
|
||||
|
||||
status = vsi_nn_kernel_gpu_config( node, &shaderParam );
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError);
|
||||
|
||||
{
|
||||
gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
|
||||
0x33333333, // TCfg
|
||||
0x11110000, // ASelt
|
||||
0x03020100, 0x03020100, // ABin
|
||||
0x00000000, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00002400, // AccumType, ConstantType, and PostShift
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
|
||||
}, GPU_DP_TYPE_16 };
|
||||
|
||||
gpu_dp_inst_t uniConvertYUV422toB_4x4 = {{
|
||||
0x05050505, // TCfg
|
||||
0x04040404, // ASelt
|
||||
0x00110000, 0x00330022, // ABin
|
||||
0x0a0a0a0a, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00000100, // AccumType, ConstantType, and PostShift
|
||||
0x3f323c00, 0x00000000, 0x3f323c00, 0x00000000,
|
||||
0x3f323c00, 0x00000000, 0x3f323c00, 0x00000000 // Constant
|
||||
}, GPU_DP_TYPE_16 };
|
||||
gpu_dp_inst_t uniConvertYUV422toG_4x4 = {{
|
||||
0x29292929, // TCfg
|
||||
0x14141414, // ASelt
|
||||
0x05110400, 0x07330622, // ABin
|
||||
0x2a2a2a2a, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00000100, // AccumType, ConstantType, and PostShift
|
||||
0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc,
|
||||
0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc // Constant
|
||||
}, GPU_DP_TYPE_16 };
|
||||
gpu_dp_inst_t uniConvertYUV422toR_4x4 = {{
|
||||
0x05050505, // TCfg
|
||||
0x04040404, // ASelt
|
||||
0x00510040, 0x00730062, // ABin
|
||||
0x0a0a0a0a, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00000100, // AccumType, ConstantType, and PostShift
|
||||
0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000,
|
||||
0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000 // Constant
|
||||
}, GPU_DP_TYPE_16 };
|
||||
gpu_dp_inst_t uniExtractUVtoCharSub128_2x8 = {{
|
||||
0x99999999, // TCfg
|
||||
0x44444444, // ASelt
|
||||
0x03020100, 0x07060504, // ABin
|
||||
0xaaaaaaaa, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00000400, // AccumType, ConstantType, and PostShift
|
||||
0x00010001, 0x00010001, 0x00010001, 0x00010001,
|
||||
0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
|
||||
}, GPU_DP_TYPE_16 };
|
||||
gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{
|
||||
0x11111111, // TCfg
|
||||
0x11110000, // ASelt
|
||||
0x06040200, 0x06040200, // ABin
|
||||
0x22222222, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00000600, // AccumType, ConstantType, and PostShift
|
||||
0x00000001, 0x00000001, 0x00000001, 0x00000001,
|
||||
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
|
||||
}, GPU_DP_TYPE_16 };
|
||||
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toB_4x4", &uniConvertYUV422toB_4x4);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toG_4x4", &uniConvertYUV422toG_4x4);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toR_4x4", &uniConvertYUV422toR_4x4);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "xrIntFloat_16", &xrIntFloat_16);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "yrIntFloat_16", &yrIntFloat_16);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar", &outputScaleVar);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractUVtoCharSub128_2x8", &uniExtractUVtoCharSub128_2x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
|
||||
switch( attr[0]->dtype )
|
||||
{
|
||||
case U8:
|
||||
case I8:
|
||||
case I16:
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertInt32toUint8_2x8);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
case F16:
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertHalftoFp16_2x8);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
OnError:
|
||||
if (attr[0])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &attr[0] );
|
||||
attr[0] = NULL;
|
||||
}
|
||||
return status;
|
||||
} /* _pre_process_yuv422_initializer() */
|
||||
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_tensor_t* const* const inputs,
|
||||
vsi_nn_tensor_t* const* const outputs,
|
||||
vsi_nn_kernel_t* kernel,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
int32_t scale_x
|
||||
)
|
||||
{
|
||||
vsi_nn_kernel_dtype_e input0_dtype = U8;
|
||||
vsi_nn_kernel_dtype_e output_dtype = U8;
|
||||
vsi_nn_kernel_convert_type_e convert_type = SCALE;
|
||||
vsi_status status = VSI_FAILURE;
|
||||
uint32_t key = 0;
|
||||
int i = 0;
|
||||
vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" );
|
||||
|
||||
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
|
||||
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
|
||||
|
||||
if (enable_copy)
|
||||
{
|
||||
convert_type = COPY;
|
||||
}
|
||||
else
|
||||
{
|
||||
convert_type = SCALE;
|
||||
}
|
||||
|
||||
key = HASH_PRE_PROCESS_YUV422_KEY( input0_dtype, output_dtype, convert_type );
|
||||
|
||||
for ( i = 0; i < _cnt_of_array(pre_process_yuv422_map); i ++ )
|
||||
{
|
||||
if ( pre_process_yuv422_map[i].key == key )
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
if ( i < _cnt_of_array(pre_process_yuv422_map) )
|
||||
{
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_yuv422_map[i].function_name );
|
||||
kernel->info.parameters = vxPreProcessyuv422Kernel_param_def;
|
||||
kernel->info.numParams = _cnt_of_array( vxPreProcessyuv422Kernel_param_def );
|
||||
|
||||
if (convert_type == COPY)
|
||||
{
|
||||
kernel->info.initialize = _pre_process_yuv422_copy_initializer;
|
||||
}
|
||||
else
|
||||
{
|
||||
kernel->info.initialize = _pre_process_yuv422_initializer;
|
||||
}
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
|
||||
"vsi_nn_kernel_header",
|
||||
pre_process_yuv422_map[i].source_name );
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
|
||||
pre_process_yuv422_map[i].source_name );
|
||||
status = VSI_SUCCESS;
|
||||
}
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_YUV422_PARAM_NUM] = { NULL };
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
vsi_nn_tensor_t* reshape_tensors[1] = {NULL};
|
||||
int32_t trans = 0;
|
||||
int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" );
|
||||
|
||||
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
|
||||
outputs[0]->attr.dim_num ) )
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
status = _query_kernel( inputs, outputs, kernel, params, scale_x );
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
uint32_t index = 2;
|
||||
int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" );
|
||||
int32_t left = vsi_nn_kernel_param_get_int32( params, "left" );
|
||||
int32_t top = vsi_nn_kernel_param_get_int32( params, "top" );
|
||||
float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" );
|
||||
float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" );
|
||||
float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" );
|
||||
float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" );
|
||||
int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" );
|
||||
int32_t yuv422_type = vsi_nn_kernel_param_get_int32( params, "yuv422_type" );
|
||||
|
||||
/* Pass parameters to node. */
|
||||
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_YUV422_PARAM_NUM,
|
||||
inputs, 1, outputs, 1 );
|
||||
|
||||
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x );
|
||||
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y );
|
||||
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left );
|
||||
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top );
|
||||
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean );
|
||||
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean );
|
||||
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean );
|
||||
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale );
|
||||
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse );
|
||||
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans );
|
||||
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &yuv422_type );
|
||||
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PRE_PROCESS_YUV422_PARAM_NUM );
|
||||
CHECK_STATUS(status);
|
||||
vsi_nn_kernel_scalar_release( &tmp_params[2] );
|
||||
vsi_nn_kernel_scalar_release( &tmp_params[3] );
|
||||
vsi_nn_kernel_scalar_release( &tmp_params[4] );
|
||||
vsi_nn_kernel_scalar_release( &tmp_params[5] );
|
||||
vsi_nn_kernel_scalar_release( &tmp_params[6] );
|
||||
vsi_nn_kernel_scalar_release( &tmp_params[7] );
|
||||
vsi_nn_kernel_scalar_release( &tmp_params[8] );
|
||||
vsi_nn_kernel_scalar_release( &tmp_params[9] );
|
||||
vsi_nn_kernel_scalar_release( &tmp_params[10] );
|
||||
vsi_nn_kernel_scalar_release( &tmp_params[11] );
|
||||
vsi_nn_kernel_scalar_release( &tmp_params[12] );
|
||||
}
|
||||
}
|
||||
vsi_safe_release_tensor(reshape_tensors[0]);
|
||||
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_EVIS( pre_process_yuv422, _setup )
|
||||
|
||||
|
|
@ -361,7 +361,7 @@ DEF_KERNEL_INITIALIZER(_resize_1d_nearest_initializer)
        0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
    }, GPU_DP_TYPE_16};

    gpu_quantize_multiplier_16bit(input_scale * output_scale, &M0, &postShift);
    gpu_quantize_multiplier_16bit((double)input_scale * (double)output_scale, &M0, &postShift);

    multAndoutZP[0] = (uint32_t)(M0);
    multAndoutZP[1] = (uint32_t)((outputZP << postShift) - inputZP * M0);

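Note (not part of the diff): the (double) casts keep precision when the two float scales are multiplied before being split into the 16-bit multiplier M0 and shift postShift. For context, a hedged reference of what the (M0, postShift) pair and multAndoutZP encode; requantize_reference is a hypothetical name and the sketch assumes an arithmetic right shift on the target:

/* Sketch only: gpu_quantize_multiplier_16bit() approximates the float
 * rescale factor as M0 / 2^postShift, so the per-element requantization
 *   out = ((in - inputZP) * M0 >> postShift) + outputZP
 * can be fused into a single multiply-add-shift
 *   out = (in * M0 + ((outputZP << postShift) - inputZP * M0)) >> postShift
 * which is exactly multAndoutZP[0] and multAndoutZP[1] above. */
static int32_t requantize_reference( int32_t in, int32_t inputZP, int32_t outputZP,
                                     uint32_t M0, int32_t postShift )
{
    int64_t acc = (int64_t)in * (int64_t)M0
                + ( ((int64_t)outputZP << postShift) - (int64_t)inputZP * (int64_t)M0 );
    return (int32_t)(acc >> postShift);
}
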
@ -1202,7 +1202,7 @@ DEF_KERNEL_INITIALIZER(_bilinear_align_corners_opt_initializer)

    if ((U8 == input_dtype) && (_is_same_quant(input_attr, output_attr)))
    {
        is_8x_align_corners = (scale_factor[0] == scale_factor[1]) && (scale_factor[0] = 0.125f);
        is_8x_align_corners = (scale_factor[0] == scale_factor[1]) && (scale_factor[0] == 0.125f);
    }

    if (is_8x_align_corners)
@ -1595,6 +1595,37 @@ OnError:
    return scale;
}

static vsi_bool _is_image_width_lt16
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t *input,
    int32_t pad_left,
    int32_t pad_right
    )
{
    vsi_nn_kernel_dtype_e in_dtype = vsi_nn_kernel_map_dtype( input->attr.dtype.vx_type );
    vsi_size_t width = input->attr.size[0];
    size_t bytes = vsi_nn_kernel_dtype_get_bytes(in_dtype);
    vsi_size_t max_cross_read_img_width = bytes == 1 ? 16 : 8;

    if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver)
    {
        return FALSE;
    }

    if (pad_left <= 0 || pad_right <= 0)
    {
        return FALSE;
    }

    if (width + pad_left + pad_right > max_cross_read_img_width )
    {
        return FALSE;
    }

    return TRUE;
}

static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t * graph,
@ -1615,6 +1646,13 @@ static vsi_nn_kernel_node_t _setup
    vsi_bool is_evis2 = (vsi_bool)(graph->ctx->config.evis.ver == VSI_NN_HW_EVIS_2);
    vsi_bool is_run_opt_kernel = FALSE;
    vsi_nn_tensor_t* scale = NULL;
    int32_t pad_left = half_pixel_centers ? 1 : 0;
    int32_t pad_right = half_pixel_centers ? 1 : 0;

    if (_is_image_width_lt16(graph, inputs[0], pad_left, pad_right))
    {
        return NULL;
    }

    status = _query_kernel( kernel, inputs, outputs, is_same_type, is_evis2,
                            align_corners, half_pixel_centers, &is_run_opt_kernel);

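Note (not part of the diff): the new _is_image_width_lt16() guard makes this _setup() return NULL when a padded row fits entirely inside the EVIS cross-read window, presumably deferring to another implementation. A hedged sketch of the bound it enforces, with the hypothetical name padded_width_fits_cross_read; for a 1-byte dtype the limit is 16 pixels, so width 12 with the 1+1 half_pixel_centers padding gives 14 <= 16 and the setup bails out, while 2-byte dtypes use a limit of 8:

static int padded_width_fits_cross_read( int width, int pad_left, int pad_right, int dtype_bytes )
{
    int limit = (dtype_bytes == 1) ? 16 : 8;
    return (pad_left > 0) && (pad_right > 0) && (width + pad_left + pad_right <= limit);
}
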
@ -371,7 +371,7 @@ DEF_KERNEL_INITIALIZER(_resize_nearest_initializer)
        0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
    }, GPU_DP_TYPE_16};

    gpu_quantize_multiplier_16bit(input_scale * output_scale, &M0, &postShift);
    gpu_quantize_multiplier_16bit((double)input_scale * (double)output_scale, &M0, &postShift);

    multAndoutZP[0] = (uint32_t)(M0);
    multAndoutZP[1] = (uint32_t)((outputZP << postShift) - inputZP * M0);

@ -34,7 +34,6 @@
|
|||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "libnnext/vx_lib_nnext.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
|
|
@ -82,6 +81,15 @@ static const _kernel_map_type _select_kernel_map[] =
|
|||
PACK_KERNEL_MAP(I8, F16, I16, F16),
|
||||
PACK_KERNEL_MAP(I8, I16, F16, F16),
|
||||
PACK_KERNEL_MAP(I8, F16, F16, U8),
|
||||
PACK_KERNEL_MAP(I8, U8, F16, U8),
|
||||
PACK_KERNEL_MAP(I8, F16, U8, U8),
|
||||
PACK_KERNEL_MAP(I8, I8, F16, I8),
|
||||
PACK_KERNEL_MAP(I8, F16, I8, I8),
|
||||
PACK_KERNEL_MAP(I8, I16, F16, I16),
|
||||
PACK_KERNEL_MAP(I8, F16, I16, I16),
|
||||
PACK_KERNEL_MAP(I8, I8, I8, F16),
|
||||
PACK_KERNEL_MAP(I8, U8, U8, F16),
|
||||
PACK_KERNEL_MAP(I8, I16, I16, F16),
|
||||
PACK_KERNEL_MAP_2D(I8, I8, I8, I8),
|
||||
PACK_KERNEL_MAP_2D(I8, U8, U8, U8),
|
||||
PACK_KERNEL_MAP_2D(I8, I16, I16, I16),
|
||||
|
|
@ -93,6 +101,15 @@ static const _kernel_map_type _select_kernel_map[] =
|
|||
PACK_KERNEL_MAP_2D(I8, F16, I16, F16),
|
||||
PACK_KERNEL_MAP_2D(I8, I16, F16, F16),
|
||||
PACK_KERNEL_MAP_2D(I8, F16, F16, U8),
|
||||
PACK_KERNEL_MAP_2D(I8, U8, F16, U8),
|
||||
PACK_KERNEL_MAP_2D(I8, F16, U8, U8),
|
||||
PACK_KERNEL_MAP_2D(I8, I8, F16, I8),
|
||||
PACK_KERNEL_MAP_2D(I8, F16, I8, I8),
|
||||
PACK_KERNEL_MAP_2D(I8, I16, F16, I16),
|
||||
PACK_KERNEL_MAP_2D(I8, F16, I16, I16),
|
||||
PACK_KERNEL_MAP_2D(I8, I8, I8, F16),
|
||||
PACK_KERNEL_MAP_2D(I8, U8, U8, F16),
|
||||
PACK_KERNEL_MAP_2D(I8, I16, I16, F16),
|
||||
};
|
||||
|
||||
/*
|
||||
|
|
@ -248,16 +265,26 @@ DEF_KERNEL_INITIALIZER(_select_initializer)
|
|||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
}
|
||||
break;
|
||||
case _PACK_SELECT_KEY( I8, I8, I8 ):
|
||||
case _PACK_SELECT_KEY( I16, I16, I16 ):
|
||||
case _PACK_SELECT_KEY( U8, U8, U8 ):
|
||||
case _PACK_SELECT_KEY( I8, F16, F16 ):
|
||||
case _PACK_SELECT_KEY( U8, F16, F16 ):
|
||||
case _PACK_SELECT_KEY( I16, F16, F16 ):
|
||||
case _PACK_SELECT_KEY( F16, U8, F16 ):
|
||||
case _PACK_SELECT_KEY( F16, I8, F16 ):
|
||||
case _PACK_SELECT_KEY( F16, I16, F16 ):
|
||||
case _PACK_SELECT_KEY( F16, F16, U8 ):
|
||||
case _PACK_SELECT_KEY( I8, I8, I8 ):
|
||||
case _PACK_SELECT_KEY( I16, I16, I16 ):
|
||||
case _PACK_SELECT_KEY( U8, U8, U8 ):
|
||||
case _PACK_SELECT_KEY( I8, F16, F16 ):
|
||||
case _PACK_SELECT_KEY( U8, F16, F16 ):
|
||||
case _PACK_SELECT_KEY( I16, F16, F16 ):
|
||||
case _PACK_SELECT_KEY( F16, U8, F16 ):
|
||||
case _PACK_SELECT_KEY( F16, I8, F16 ):
|
||||
case _PACK_SELECT_KEY( F16, I16, F16 ):
|
||||
case _PACK_SELECT_KEY( F16, F16, U8 ):
|
||||
case _PACK_SELECT_KEY( U8, F16, U8 ):
|
||||
case _PACK_SELECT_KEY( F16, U8, U8 ):
|
||||
case _PACK_SELECT_KEY( I8, F16, I8 ):
|
||||
case _PACK_SELECT_KEY( F16, I8, I8 ):
|
||||
case _PACK_SELECT_KEY( I16, F16, I16 ):
|
||||
case _PACK_SELECT_KEY( F16, I16, I16 ):
|
||||
case _PACK_SELECT_KEY( I8, I8, F16 ):
|
||||
case _PACK_SELECT_KEY( I16, I16, F16 ):
|
||||
case _PACK_SELECT_KEY( U8, U8, F16 ):
|
||||
case _PACK_SELECT_KEY( BF16, BF16, BF16 ):
|
||||
{
|
||||
uint32_t multAndoutZP0[2] = {0};
|
||||
uint32_t multAndoutZP1[2] = {0};
|
||||
|
|
@ -367,9 +394,12 @@ static vsi_status _query_kernel
    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    cond_dtype = (BOOL8 == cond_dtype || U8 == cond_dtype) ? I8 : cond_dtype;
    in0_dtype = (BOOL8 == in0_dtype) ? I8 : in0_dtype;
    in1_dtype = (BOOL8 == in1_dtype) ? I8 : in1_dtype;
    out_dtype = (BOOL8 == out_dtype) ? I8 : out_dtype;
    in0_dtype = (BOOL8 == in0_dtype) ? I8 : in0_dtype;
    in0_dtype = (BF16 == in0_dtype) ? I16 : in0_dtype;
    in1_dtype = (BOOL8 == in1_dtype) ? I8 : in1_dtype;
    in1_dtype = (BF16 == in1_dtype) ? I16 : in1_dtype;
    out_dtype = (BOOL8 == out_dtype) ? I8 : out_dtype;
    out_dtype = (BF16 == out_dtype) ? I16 : out_dtype;

    key = SELECT_HASH_KEY(cond_dtype, in0_dtype, in1_dtype, out_dtype, image_2d);

@ -415,7 +445,7 @@ static vsi_nn_kernel_node_t _setup
|
|||
vsi_bool image_2d = FALSE;
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
|
||||
if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
|
||||
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
|
||||
outputs[0]->attr.dim_num ) )
|
||||
{
|
||||
return NULL;
|
||||
|
|
@ -424,10 +454,10 @@ static vsi_nn_kernel_node_t _setup
|
|||
image_2d = (outputs[0]->attr.dim_num == 2);
|
||||
status = _query_kernel( kernel, inputs, outputs, image_2d);
|
||||
|
||||
if( VSI_SUCCESS == status)
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if( node )
|
||||
if ( node )
|
||||
{
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( node_params, _SELECT_PARAM_NUM,
|
||||
|
|
|
|||
|
|
@ -544,6 +544,8 @@ static vsi_nn_kernel_node_t _setup
|
|||
vsi_size_t depthOut = new_rank > 2 ? reshape_tensors[1]->attr.size[2] : 1;
|
||||
vsi_size_t batchIn = new_rank > 3 ? reshape_tensors[0]->attr.size[3] : 1;
|
||||
|
||||
shapes[1][2] = shapes[1][2] == 0 ? 1 : shapes[1][2];
|
||||
shapes[1][3] = shapes[1][3] == 0 ? 1 : shapes[1][3];
|
||||
|
||||
vsi_nn_kernel_node_pack_io( node_params, _EVIS_PARAM_NUM,
|
||||
&reshape_tensors[0], 1, &reshape_tensors[1], 1 );
|
||||
|
|
|
|||
|
|
@ -0,0 +1,797 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2021 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_node.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "kernel/vsi_nn_sp_unit_operation.h"
|
||||
#include "kernel/vsi_nn_sp_lut.h"
|
||||
|
||||
#if (VX_STREAM_PROCESSOR_SUPPORT)
|
||||
|
||||
vsi_nn_spinst_t * vsi_nn_sp_moments_axis1_inst
|
||||
(
|
||||
vx_context context,
|
||||
int32_t fifo_depth,
|
||||
int32_t max_vector_depth
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
const int32_t spInitInstsNum = fifo_depth == 1 ? 4 : 3;
|
||||
const int32_t spLoopInstsNum = fifo_depth == 2 ? 4 : 3;
|
||||
const int32_t spCompleteInstsNum = fifo_depth == 1 ? 3 : 0;
|
||||
const int32_t spInstsNum = spInitInstsNum + spLoopInstsNum + spCompleteInstsNum;
|
||||
|
||||
vsi_nn_spinst_t *spinst = NULL;
|
||||
vsi_nn_spinst_inst_param sp_insts_param[11];
|
||||
vsi_nn_spinst_attr_t attr;
|
||||
|
||||
memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum);
|
||||
vsi_nn_init_spinst_attr(&attr);
|
||||
|
||||
if (fifo_depth == 1)
|
||||
{
|
||||
/* init inst0: r3 = 0 */
|
||||
status = vsi_nn_sp_move_constant(&sp_insts_param[0], 0, VSI_NN_SP_SR3);
|
||||
/* init inst1: r1 = 0 */
|
||||
status |= vsi_nn_sp_move_constant(&sp_insts_param[1], 0, VSI_NN_SP_SR1);
|
||||
/* init inst2: r4 = 0 */
|
||||
status |= vsi_nn_sp_move_constant(&sp_insts_param[2], 0, VSI_NN_SP_SR4);
|
||||
/* init inst3: nop */
|
||||
status |= vsi_nn_sp_nop(&sp_insts_param[3]);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
||||
/* loop inst0: r5 = r1 * r1 || r1 = in */
|
||||
status = vsi_nn_sp_mul(&sp_insts_param[4], VSI_NN_SP_SR1, VSI_NN_SP_SR1, VSI_NN_SP_SR5);
|
||||
status |= vsi_nn_sp_move(&sp_insts_param[4], VSI_NN_SP_SRIN, VSI_NN_SP_SR1);
|
||||
/* loop inst1: r3 = r3 + r1 || out = r1 */
|
||||
status |= vsi_nn_sp_add(&sp_insts_param[5], VSI_NN_SP_SR3, VSI_NN_SP_SR1, VSI_NN_SP_SR3);
|
||||
status |= vsi_nn_sp_move(&sp_insts_param[5], VSI_NN_SP_SR1, VSI_NN_SP_SROUT);
|
||||
/* loop inst2: r5 = r5 + r4 */
|
||||
status |= vsi_nn_sp_add(&sp_insts_param[6], VSI_NN_SP_SR5, VSI_NN_SP_SR4, VSI_NN_SP_SR5);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
||||
/* complete inst0: v11 = r3 */
|
||||
status = vsi_nn_sp_move(&sp_insts_param[7], VSI_NN_SP_SR3, VSI_NN_SP_VR11);
|
||||
/* complete inst1: r3 = r3 + r1 || out = r1 */
|
||||
status |= vsi_nn_sp_nop(&sp_insts_param[8]);
|
||||
/* complete inst2: v12 = r4 */
|
||||
status = vsi_nn_sp_move(&sp_insts_param[9], VSI_NN_SP_SR4, VSI_NN_SP_VR12);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
||||
attr.flush_cycle_num = 8;
|
||||
}
|
||||
else if (fifo_depth == 2)
|
||||
{
|
||||
/* init inst0: r3 = 0 */
|
||||
status = vsi_nn_sp_move_constant(&sp_insts_param[0], 0, VSI_NN_SP_SR3);
|
||||
/* init inst1: r2 = 1 */
|
||||
status |= vsi_nn_sp_move_constant(&sp_insts_param[1], 1, VSI_NN_SP_SR2);
|
||||
/* init inst2: r4 = 0 */
|
||||
status |= vsi_nn_sp_move_constant(&sp_insts_param[2], 0, VSI_NN_SP_SR4);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
||||
/* loop inst0: out = r2 * r1 || v11 = r1 + r3 | r1 = in */
|
||||
status = vsi_nn_sp_mul(&sp_insts_param[3], VSI_NN_SP_SR2, VSI_NN_SP_SR1, VSI_NN_SP_SROUT);
|
||||
status |= vsi_nn_sp_add(&sp_insts_param[3], VSI_NN_SP_SR1, VSI_NN_SP_SR3, VSI_NN_SP_VR11);
|
||||
status |= vsi_nn_sp_move(&sp_insts_param[3], VSI_NN_SP_SRIN, VSI_NN_SP_SR1);
|
||||
/* loop inst1: v12 = r4 + r5 | r3 = v11 */
|
||||
status |= vsi_nn_sp_add(&sp_insts_param[4], VSI_NN_SP_SR4, VSI_NN_SP_SR5, VSI_NN_SP_VR12);
|
||||
status |= vsi_nn_sp_move(&sp_insts_param[4], VSI_NN_SP_VR11, VSI_NN_SP_SR3);
|
||||
/* loop inst2: r4 = v12 */
|
||||
status |= vsi_nn_sp_move(&sp_insts_param[5], VSI_NN_SP_VR12, VSI_NN_SP_SR4);
|
||||
/* loop inst3: r5 = r1 * r1 */
|
||||
status |= vsi_nn_sp_mul(&sp_insts_param[6], VSI_NN_SP_SR1, VSI_NN_SP_SR1, VSI_NN_SP_SR5);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
||||
attr.flush_cycle_num = 5;
|
||||
|
||||
attr.ignored_leading_v11_rd = fifo_depth;
|
||||
attr.ignored_leading_v12_rd = fifo_depth;
|
||||
attr.ignored_leading_v11_wr = 1;
|
||||
attr.ignored_leading_v12_wr = 1;
|
||||
|
||||
attr.num_of_v11_rd_in_flush_cycle = 1;
|
||||
attr.num_of_v12_rd_in_flush_cycle = 1;
|
||||
attr.num_of_v11_wr_in_flush_cycle = 1;
|
||||
attr.num_of_v12_wr_in_flush_cycle = 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* init inst0: r3 = 0 */
|
||||
status = vsi_nn_sp_move_constant(&sp_insts_param[0], 0, VSI_NN_SP_SR3);
|
||||
/* init inst1: r2 = 0 */
|
||||
status |= vsi_nn_sp_move_constant(&sp_insts_param[1], 0, VSI_NN_SP_SR2);
|
||||
/* init inst2: r4 = 0 */
|
||||
status |= vsi_nn_sp_move_constant(&sp_insts_param[2], 0, VSI_NN_SP_SR4);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
||||
/* loop inst0: r5 = r1 * r1 | out = r2 + r1 || r1 = in */
|
||||
status = vsi_nn_sp_mul(&sp_insts_param[3], VSI_NN_SP_SR1, VSI_NN_SP_SR1, VSI_NN_SP_SR5);
|
||||
status |= vsi_nn_sp_add(&sp_insts_param[3], VSI_NN_SP_SR2, VSI_NN_SP_SR1, VSI_NN_SP_SROUT);
|
||||
status |= vsi_nn_sp_move(&sp_insts_param[3], VSI_NN_SP_SRIN, VSI_NN_SP_SR1);
|
||||
/* loop inst1: v11 = r1 + r3 | r3 = v11 */
|
||||
status |= vsi_nn_sp_add(&sp_insts_param[4], VSI_NN_SP_SR1, VSI_NN_SP_SR3, VSI_NN_SP_VR11);
|
||||
status |= vsi_nn_sp_move(&sp_insts_param[4], VSI_NN_SP_VR11, VSI_NN_SP_SR3);
|
||||
/* loop inst2: v12 = r4 + r5 | r4 = v12 */
|
||||
status |= vsi_nn_sp_add(&sp_insts_param[5], VSI_NN_SP_SR4, VSI_NN_SP_SR5, VSI_NN_SP_VR12);
|
||||
status |= vsi_nn_sp_move(&sp_insts_param[5], VSI_NN_SP_VR12, VSI_NN_SP_SR4);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
||||
attr.ignored_leading_v11_rd = fifo_depth;
|
||||
attr.ignored_leading_v12_rd = fifo_depth;
|
||||
attr.ignored_leading_v11_wr = 1;
|
||||
attr.ignored_leading_v12_wr = 1;
|
||||
|
||||
attr.num_of_v11_rd_in_flush_cycle = 1;
|
||||
attr.num_of_v12_rd_in_flush_cycle = 1;
|
||||
attr.num_of_v11_wr_in_flush_cycle = 2;
|
||||
attr.num_of_v12_wr_in_flush_cycle = 2;
|
||||
|
||||
attr.flush_cycle_num = 5;
|
||||
}
|
||||
|
||||
attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_YZMERGE;
|
||||
attr.input_setup = VSI_NN_SP_INPUT_SETUP_SINGLE_INPUT;
|
||||
|
||||
attr.prog_init_instr_num = spInitInstsNum;
|
||||
attr.prog_loop_instr_num = spLoopInstsNum;
|
||||
attr.prog_complete_instr_num = spCompleteInstsNum;
|
||||
attr.ignored_leading_outputs = 1;
|
||||
attr.v11_reset_at_start = VSI_NN_SP_V_RESET_AT_START_RESET;
|
||||
attr.v12_reset_at_start = VSI_NN_SP_V_RESET_AT_START_RESET;
|
||||
|
||||
attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_X;
|
||||
attr.split_tilex_equal_imgx = TRUE;
|
||||
attr.split_max_vector_depth = max_vector_depth;
|
||||
|
||||
spinst = vsi_nn_create_spinst_by_context(context);
|
||||
CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final );
|
||||
status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum);
|
||||
status |= vsi_nn_set_spinst_attr(spinst, attr);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
||||
final:
|
||||
return spinst;
|
||||
}
|
||||
|
||||
DEF_SP_KERNEL_QUERY(moments_axis1_query)
|
||||
(
|
||||
vsi_nn_kernel_node_t node
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vx_size index = 0;
|
||||
vx_size tile_size[2] = {0};
|
||||
vsi_nn_spinst_t *spinst = NULL;
|
||||
int32_t fifo_depth = 0;
|
||||
int32_t max_vector_depth = 0;
|
||||
vx_context ctx = vxGetContext((vx_reference)node);
|
||||
vx_hardware_caps_params_ext2_t hw_param;
|
||||
|
||||
memset(&hw_param, 0, sizeof(vx_hardware_caps_params_ext2_t));
|
||||
status = vxQueryHardwareCaps(ctx, (vx_hardware_caps_params_t*)(&hw_param), sizeof(vx_hardware_caps_params_ext2_t));
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
|
||||
status = vxQueryNode(node, VX_NODE_SWTILING_TILE_XY, tile_size, sizeof(tile_size));
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
status = vxQueryNode(node, VX_NODE_SPINST_INDEX, &index, sizeof(index));
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
|
||||
fifo_depth = (int32_t)ceil((float)tile_size[0] / (float)hw_param.streamProcessorExecCount);
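/*
** Example (hypothetical numbers): with tile_size[0] = 60 and
** streamProcessorExecCount = 16, fifo_depth = ceil(60 / 16) = 4, which
** falls through to the generic (else) variant in vsi_nn_sp_moments_axis1_inst.
*/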
|
||||
max_vector_depth = hw_param.streamProcessorVectorSize;
|
||||
|
||||
spinst = vsi_nn_sp_moments_axis1_inst(ctx, fifo_depth, max_vector_depth);
|
||||
|
||||
status = vxSetParameterByIndex( node, (uint32_t)index, (vx_reference)spinst->sp );
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
|
||||
final:
|
||||
if (spinst)
|
||||
{
|
||||
vsi_nn_release_spinst(&spinst);
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
vsi_nn_kernel_node_t vsi_nn_sp_moments_axis1_node
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t * input,
|
||||
vsi_nn_tensor_t * output0,
|
||||
vsi_nn_tensor_t * output1
|
||||
)
|
||||
{
|
||||
const uint32_t input_count = 1;
|
||||
const uint32_t output_count = 2;
|
||||
vx_tensor inputs_tensor[1] = {NULL};
|
||||
vx_tensor outputs_tensor[2] = {NULL};
|
||||
vx_node node = NULL;
|
||||
int32_t max_vector_depth = graph->ctx->config.sp_vector_depth;
|
||||
int32_t fifo_depth = 4;
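/*
** The depth used here is a graph-build-time default; the moments_axis1_query
** callback assigned below via vxAssignNodeQueryCallback re-derives it from
** the actual SWTiling tile size and swaps in a rebuilt SP program.
*/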
|
||||
|
||||
vsi_nn_spinst_t *spinst = NULL;
|
||||
|
||||
spinst = vsi_nn_sp_moments_axis1_inst(graph->ctx->c, fifo_depth, max_vector_depth);
|
||||
|
||||
inputs_tensor[0] = input->t;
|
||||
outputs_tensor[0] = output0->t;
|
||||
outputs_tensor[1] = output1->t;
|
||||
node = vxStreamProcessorNode(
|
||||
graph->g,
|
||||
inputs_tensor,
|
||||
input_count,
|
||||
outputs_tensor,
|
||||
output_count,
|
||||
spinst->sp,
|
||||
NULL);
|
||||
|
||||
if (node)
|
||||
{
|
||||
vxAssignNodeQueryCallback(node, moments_axis1_query);
|
||||
}
|
||||
|
||||
if (spinst)
|
||||
{
|
||||
vsi_nn_release_spinst(&spinst);
|
||||
}
|
||||
|
||||
return (vsi_nn_kernel_node_t)node;
|
||||
}
|
||||
|
||||
vsi_nn_kernel_node_t vsi_nn_sp_ln_means_axis1_node
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t * input,
|
||||
vsi_nn_tensor_t * output,
|
||||
float inv_m,
|
||||
float const_a,
|
||||
float s,
|
||||
float eps,
|
||||
float output_scale
|
||||
)
|
||||
{
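/*
** Rough intent (inferred from the parameters and from the LINEAR_RSQRT LUT
** configured below): this pass consumes the per-row sums produced by the
** moments node (sum(x) in V11, sum(x*x) in V12) and emits the normalization
** terms needed by the next pass -- a mean-like value scaled by inv_m and a
** reciprocal-standard-deviation-like value evaluated through the
** piecewise-linear LUT with parameters s, eps and output_scale.
** The exact LUT curve is defined in vsi_nn_sp_lut().
*/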
|
||||
const int32_t spInitInstsNum = 2;
|
||||
const int32_t spLoopInstsNum = 5;
|
||||
const int32_t spInstsNum = spInitInstsNum + spLoopInstsNum;
|
||||
|
||||
const uint32_t input_count = 1;
|
||||
const uint32_t output_count = 1;
|
||||
vx_tensor inputs_tensor[1] = {NULL};
|
||||
vx_tensor outputs_tensor[1] = {NULL};
|
||||
vx_node node = NULL;
|
||||
int32_t max_vector_depth = graph->ctx->config.sp_vector_depth;
|
||||
|
||||
vsi_nn_spinst_t *spinst = NULL;
|
||||
vsi_nn_spinst_inst_param sp_insts_param[7];
|
||||
vsi_nn_spinst_attr_t attr;
|
||||
vsi_nn_sp_lut_params sp_lut_params;
|
||||
vx_lut_params_s vx_lut_params;
|
||||
|
||||
vsi_status status = VSI_FAILURE;
|
||||
|
||||
memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum);
|
||||
vsi_nn_init_spinst_attr(&attr);
|
||||
memset(&sp_lut_params, 0, sizeof(vsi_nn_sp_lut_params));
|
||||
memset(&vx_lut_params, 0, sizeof(vx_lut_params_s));
|
||||
|
||||
/* init inst0: r2 = const_a */
|
||||
status = vsi_nn_sp_move_constant(&sp_insts_param[0], const_a, VSI_NN_SP_SR2);
|
||||
/* init inst1: r3 = inv_m */
status |= vsi_nn_sp_move_constant(&sp_insts_param[1], inv_m, VSI_NN_SP_SR3);
/* loop inst0: r4 = v11 * v11 || r6 = r4 + r5 || r5 = v11 */
status |= vsi_nn_sp_mul(&sp_insts_param[2], VSI_NN_SP_VR11, VSI_NN_SP_VR11, VSI_NN_SP_SR4);
|
||||
status |= vsi_nn_sp_add(&sp_insts_param[2], VSI_NN_SP_SR4, VSI_NN_SP_SR5, VSI_NN_SP_SR6);
|
||||
status |= vsi_nn_sp_move(&sp_insts_param[2], VSI_NN_SP_VR11, VSI_NN_SP_SR5);
|
||||
/* loop inst1: r1 = pwlMul() || r7 = pwlAdd() */
|
||||
status |= vsi_nn_sp_mul(&sp_insts_param[3], VSI_NN_SP_PWLMUL, VSI_NN_SP_PWLMUL, VSI_NN_SP_SR1);
|
||||
status |= vsi_nn_sp_sub(&sp_insts_param[3], VSI_NN_SP_PWLADD, VSI_NN_SP_PWLADD, VSI_NN_SP_SR7);
|
||||
/* loop inst2: r5 = r2 * v12 || v12 = r8 + r7 */
|
||||
status |= vsi_nn_sp_mul(&sp_insts_param[4], VSI_NN_SP_SR2, VSI_NN_SP_VR12, VSI_NN_SP_SR5);
|
||||
status |= vsi_nn_sp_add(&sp_insts_param[4], VSI_NN_SP_SR8, VSI_NN_SP_SR7, VSI_NN_SP_VR12);
|
||||
/* loop inst3: r1 = setup(r6) || v11 = r3 * r5 || r7 = r1 */
|
||||
status |= vsi_nn_sp_pwl_setup0(&sp_insts_param[5], VSI_NN_SP_SR6, VSI_NN_SP_SR1);
|
||||
status |= vsi_nn_sp_mul(&sp_insts_param[5], VSI_NN_SP_SR3, VSI_NN_SP_SR5, VSI_NN_SP_VR11);
|
||||
status |= vsi_nn_sp_move(&sp_insts_param[5], VSI_NN_SP_SR1, VSI_NN_SP_SR7);
|
||||
/* loop inst4: r8 = r1 * r7 */
|
||||
status |= vsi_nn_sp_mul(&sp_insts_param[6], VSI_NN_SP_SR1, VSI_NN_SP_SR7, VSI_NN_SP_SR8);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
||||
attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_YZMERGE;
|
||||
|
||||
attr.input_setup = VSI_NN_SP_INPUT_SETUP_V11;
|
||||
attr.prog_init_instr_num = spInitInstsNum;
|
||||
attr.prog_loop_instr_num = spLoopInstsNum;
|
||||
attr.ignored_leading_outputs = 0;
|
||||
attr.ignored_leading_v11_wr = 0;
|
||||
attr.ignored_leading_v12_wr = 3;
|
||||
attr.ignored_leading_v11_rd = 0;
|
||||
attr.flush_cycle_num = 17;
|
||||
|
||||
attr.num_of_v11_rd_in_flush_cycle = 0;
|
||||
attr.num_of_v12_rd_in_flush_cycle = 1;
|
||||
attr.num_of_v11_wr_in_flush_cycle = 1;
|
||||
attr.num_of_v12_wr_in_flush_cycle = 4;
|
||||
|
||||
attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_X;
|
||||
attr.split_tilex_equal_imgx = TRUE;
|
||||
attr.split_max_vector_depth = max_vector_depth;
|
||||
|
||||
spinst = vsi_nn_create_spinst(graph);
|
||||
CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final );
|
||||
status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum);
|
||||
status |= vsi_nn_set_spinst_attr(spinst, attr);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
||||
inputs_tensor[0] = input->t;
|
||||
outputs_tensor[0] = output->t;
|
||||
|
||||
vx_lut_params.lut_function = VX_NN_ACTIVATION_CUSTOM;
|
||||
vx_lut_params.in_lut = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_SP_LUT_MAX_SIZE);
|
||||
vx_lut_params.out_lut = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_SP_LUT_MAX_SIZE);
|
||||
|
||||
sp_lut_params.act_type = VSI_NN_SP_ACT_LINEAR_RSQRT;
|
||||
sp_lut_params.params[0] = s;
|
||||
sp_lut_params.params[1] = eps;
|
||||
sp_lut_params.params[2] = output_scale;
|
||||
vsi_nn_sp_lut(vx_lut_params.in_lut, vx_lut_params.out_lut, &sp_lut_params);
|
||||
|
||||
node = vxStreamProcessorNode(
|
||||
graph->g,
|
||||
inputs_tensor,
|
||||
input_count,
|
||||
outputs_tensor,
|
||||
output_count,
|
||||
spinst->sp,
|
||||
&vx_lut_params);
|
||||
|
||||
final:
|
||||
if (spinst)
|
||||
{
|
||||
vsi_nn_release_spinst(&spinst);
|
||||
}
|
||||
|
||||
if (vx_lut_params.in_lut)
|
||||
{
|
||||
vxReleaseLUT(&vx_lut_params.in_lut);
|
||||
vx_lut_params.in_lut = NULL;
|
||||
}
|
||||
if (vx_lut_params.out_lut)
|
||||
{
|
||||
vxReleaseLUT(&vx_lut_params.out_lut);
|
||||
vx_lut_params.out_lut = NULL;
|
||||
}
|
||||
|
||||
return (vsi_nn_kernel_node_t)node;
|
||||
}
|
||||
|
||||
vsi_nn_spinst_t * vsi_nn_sp_layer_norm_axis1_inst
|
||||
(
|
||||
vx_context context,
|
||||
int32_t fifo_depth,
|
||||
int32_t max_vector_depth
|
||||
)
|
||||
{
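/*
** Normalization pass (inferred from the loop comments): for each input
** element the program emits out = (in - V11) * V12, i.e. it subtracts the
** per-row mean held in V11 and multiplies by the per-row 1/std held in V12.
** The two branches are functionally identical; the nop-padded one only
** relaxes the pipeline for shallow FIFO depths.
*/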
|
||||
vsi_status status = VSI_FAILURE;
|
||||
const int32_t spInitInstsNum = 0;
|
||||
const int32_t spLoopInstsNum = fifo_depth > 3 ? 2 : 5;
|
||||
const int32_t spInstsNum = spInitInstsNum + spLoopInstsNum;
|
||||
|
||||
vsi_nn_spinst_t *spinst = NULL;
|
||||
vsi_nn_spinst_inst_param sp_insts_param[5];
|
||||
vsi_nn_spinst_attr_t attr;
|
||||
|
||||
memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum);
|
||||
vsi_nn_init_spinst_attr(&attr);
|
||||
|
||||
if (fifo_depth > 3)
|
||||
{
|
||||
/* loop inst0: out = in - v11 || v11 = v11 */
|
||||
status = vsi_nn_sp_sub(&sp_insts_param[0], VSI_NN_SP_SRIN, VSI_NN_SP_VR11, VSI_NN_SP_SR1);
|
||||
status |= vsi_nn_sp_move(&sp_insts_param[0], VSI_NN_SP_VR11, VSI_NN_SP_VR11);
|
||||
/* loop inst1: out = r1 * v12 | v12 = v12 */
|
||||
status |= vsi_nn_sp_mul(&sp_insts_param[1], VSI_NN_SP_SR1, VSI_NN_SP_VR12, VSI_NN_SP_SROUT);
|
||||
status |= vsi_nn_sp_move(&sp_insts_param[1], VSI_NN_SP_VR12, VSI_NN_SP_VR12);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
||||
attr.flush_cycle_num = 3;
|
||||
attr.ignored_leading_v12_rd = 1;
|
||||
attr.ignored_leading_v12_wr = 1;
|
||||
|
||||
attr.num_of_v11_rd_in_flush_cycle = 0;
|
||||
attr.num_of_v12_rd_in_flush_cycle = 2;
|
||||
attr.num_of_v11_wr_in_flush_cycle = 0;
|
||||
attr.num_of_v12_wr_in_flush_cycle = 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* loop inst0: out = in - v11 || v11 = v11 */
|
||||
status = vsi_nn_sp_sub(&sp_insts_param[0], VSI_NN_SP_SRIN, VSI_NN_SP_VR11, VSI_NN_SP_SR1);
|
||||
status |= vsi_nn_sp_move(&sp_insts_param[0], VSI_NN_SP_VR11, VSI_NN_SP_VR11);
|
||||
/* loop inst1: nop */
|
||||
status |= vsi_nn_sp_nop(&sp_insts_param[1]);
|
||||
/* loop inst2: nop */
|
||||
status |= vsi_nn_sp_nop(&sp_insts_param[2]);
|
||||
/* loop inst3: out = r1 * v12 | v12 = v12 */
|
||||
status |= vsi_nn_sp_mul(&sp_insts_param[3], VSI_NN_SP_SR1, VSI_NN_SP_VR12, VSI_NN_SP_SROUT);
|
||||
status |= vsi_nn_sp_move(&sp_insts_param[3], VSI_NN_SP_VR12, VSI_NN_SP_VR12);
|
||||
/* loop inst4: nop */
|
||||
status |= vsi_nn_sp_nop(&sp_insts_param[4]);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
||||
attr.flush_cycle_num = 4;
|
||||
attr.ignored_leading_v12_rd = 0;
|
||||
attr.ignored_leading_v12_wr = 0;
|
||||
|
||||
attr.num_of_v11_rd_in_flush_cycle = 0;
|
||||
attr.num_of_v12_rd_in_flush_cycle = 1;
|
||||
attr.num_of_v11_wr_in_flush_cycle = 0;
|
||||
attr.num_of_v12_wr_in_flush_cycle = 1;
|
||||
}
|
||||
|
||||
attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_YZMERGE;
|
||||
attr.input_setup = VSI_NN_SP_INPUT_SETUP_SINGLE_INPUT;
|
||||
|
||||
attr.prog_init_instr_num = spInitInstsNum;
|
||||
attr.prog_loop_instr_num = spLoopInstsNum;
|
||||
attr.ignored_leading_outputs = 0;
|
||||
attr.ignored_leading_v11_rd = 0;
|
||||
attr.ignored_leading_v11_wr = 0;
|
||||
|
||||
attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_X;
|
||||
attr.split_tilex_equal_imgx = TRUE;
|
||||
attr.split_max_vector_depth = max_vector_depth;
|
||||
|
||||
spinst = vsi_nn_create_spinst_by_context(context);
|
||||
CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final );
|
||||
status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum);
|
||||
status |= vsi_nn_set_spinst_attr(spinst, attr);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
||||
final:
|
||||
return spinst;
|
||||
}
|
||||
|
||||
DEF_SP_KERNEL_QUERY(layer_norm_axis1_query)
|
||||
(
|
||||
vsi_nn_kernel_node_t node
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vx_size index = 0;
|
||||
vx_size tile_size[2] = {0};
|
||||
vsi_nn_spinst_t *spinst = NULL;
|
||||
int32_t fifo_depth = 0;
|
||||
int32_t max_vector_depth = 0;
|
||||
vx_context ctx = vxGetContext((vx_reference)node);
|
||||
vx_hardware_caps_params_ext2_t hw_param;
|
||||
|
||||
memset(&hw_param, 0, sizeof(vx_hardware_caps_params_ext2_t));
|
||||
status = vxQueryHardwareCaps(ctx, (vx_hardware_caps_params_t*)(&hw_param), sizeof(vx_hardware_caps_params_ext2_t));
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
|
||||
status = vxQueryNode(node, VX_NODE_SWTILING_TILE_XY, tile_size, sizeof(tile_size));
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
status = vxQueryNode(node, VX_NODE_SPINST_INDEX, &index, sizeof(index));
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
|
||||
fifo_depth = (int32_t)ceil((float)tile_size[0] / (float)hw_param.streamProcessorExecCount);
|
||||
max_vector_depth = hw_param.streamProcessorVectorSize;
|
||||
|
||||
spinst = vsi_nn_sp_layer_norm_axis1_inst(ctx, fifo_depth, max_vector_depth);
|
||||
|
||||
status = vxSetParameterByIndex( node, (uint32_t)index, (vx_reference)spinst->sp );
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
|
||||
final:
|
||||
if (spinst)
|
||||
{
|
||||
vsi_nn_release_spinst(&spinst);
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
vsi_nn_kernel_node_t vsi_nn_sp_layer_norm_axis1_node
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t * input0,
|
||||
vsi_nn_tensor_t * input1,
|
||||
vsi_nn_tensor_t * output
|
||||
)
|
||||
{
|
||||
const uint32_t input_count = 2;
|
||||
const uint32_t output_count = 1;
|
||||
vx_tensor inputs_tensor[2] = {NULL};
|
||||
vx_tensor outputs_tensor[1] = {NULL};
|
||||
vx_node node = NULL;
|
||||
int32_t max_vector_depth = graph->ctx->config.sp_vector_depth;
|
||||
int32_t fifo_depth = 4;
|
||||
vsi_nn_spinst_t *spinst = NULL;
|
||||
|
||||
spinst = vsi_nn_sp_layer_norm_axis1_inst(graph->ctx->c, fifo_depth, max_vector_depth);
|
||||
|
||||
inputs_tensor[0] = input0->t;
|
||||
inputs_tensor[1] = input1->t;
|
||||
outputs_tensor[0] = output->t;
|
||||
node = vxStreamProcessorNode(
|
||||
graph->g,
|
||||
inputs_tensor,
|
||||
input_count,
|
||||
outputs_tensor,
|
||||
output_count,
|
||||
spinst->sp,
|
||||
NULL);
|
||||
|
||||
if (node)
|
||||
{
|
||||
vxAssignNodeQueryCallback(node, layer_norm_axis1_query);
|
||||
}
|
||||
|
||||
if (spinst)
|
||||
{
|
||||
vsi_nn_release_spinst(&spinst);
|
||||
}
|
||||
|
||||
return (vsi_nn_kernel_node_t)node;
|
||||
}
|
||||
|
||||
vsi_nn_kernel_node_t vsi_nn_sp_load_weight_bias_node
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t * weight,
|
||||
vsi_nn_tensor_t * bias,
|
||||
vsi_nn_tensor_t * dummy_output
|
||||
)
|
||||
{
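/*
** Loader pass (inferred from the instruction stream): the two inputs are
** interleaved and simply parked in the vector registers -- weight rows go to
** V11 and bias rows to V12 -- so that a following node can apply
** out = in * V11 + V12 without re-reading the weight/bias tensors.
*/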
|
||||
const int32_t spLoopInstsNum = 2;
|
||||
const int32_t spInstsNum = spLoopInstsNum;
|
||||
|
||||
const uint32_t input_count = 2;
|
||||
const uint32_t output_count = 1;
|
||||
vx_tensor inputs_tensor[2] = {NULL};
|
||||
vx_tensor outputs_tensor[2] = {NULL};
|
||||
vx_node node = NULL;
|
||||
int32_t max_vector_depth = graph->ctx->config.sp_vector_depth /
|
||||
graph->ctx->config.sp_exec_count;
|
||||
|
||||
vsi_nn_spinst_t *spinst = NULL;
|
||||
vsi_nn_spinst_inst_param sp_insts_param[2];
|
||||
vsi_nn_spinst_attr_t attr;
|
||||
|
||||
vsi_status status = VSI_FAILURE;
|
||||
|
||||
memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum);
|
||||
vsi_nn_init_spinst_attr(&attr);
|
||||
|
||||
/* loop inst0: v11 = in */
status = vsi_nn_sp_move(&sp_insts_param[0], VSI_NN_SP_SRIN, VSI_NN_SP_VR11);
/* loop inst1: v12 = in */
status |= vsi_nn_sp_move(&sp_insts_param[1], VSI_NN_SP_SRIN, VSI_NN_SP_VR12);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
||||
attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_YZMERGE;
|
||||
attr.input_setup = VSI_NN_SP_INPUT_SETUP_INTERLEAVE_TWO_INPUT;
|
||||
|
||||
attr.prog_loop_instr_num = spLoopInstsNum;
|
||||
attr.ignored_leading_outputs = 0;
|
||||
attr.flush_cycle_num = 0;
|
||||
attr.ignored_leading_v11_rd = 0;
|
||||
attr.ignored_leading_v11_wr = 0;
|
||||
attr.ignored_leading_v12_rd = 0;
|
||||
attr.ignored_leading_v12_wr = 0;
|
||||
attr.v11_reset_at_start = VSI_NN_SP_V_RESET_AT_START_RESET;
|
||||
attr.v12_reset_at_start = VSI_NN_SP_V_RESET_AT_START_RESET;
|
||||
attr.ch0_post_redistribute = VSI_NN_SP_CH_POST_REDISTRIBUTE_VECTOR_GATHER;
|
||||
attr.ch1_post_redistribute = VSI_NN_SP_CH_POST_REDISTRIBUTE_VECTOR_GATHER;
|
||||
|
||||
attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_YZ;
|
||||
attr.split_max_vector_depth = max_vector_depth;
|
||||
|
||||
spinst = vsi_nn_create_spinst(graph);
|
||||
CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final );
|
||||
status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum);
|
||||
status |= vsi_nn_set_spinst_attr(spinst, attr);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
||||
inputs_tensor[0] = weight->t;
|
||||
inputs_tensor[1] = bias->t;
|
||||
outputs_tensor[0] = dummy_output->t;
|
||||
|
||||
node = vxStreamProcessorNode(
|
||||
graph->g,
|
||||
inputs_tensor,
|
||||
input_count,
|
||||
outputs_tensor,
|
||||
output_count,
|
||||
spinst->sp,
|
||||
NULL);
|
||||
|
||||
final:
|
||||
if (spinst)
|
||||
{
|
||||
vsi_nn_release_spinst(&spinst);
|
||||
}
|
||||
|
||||
return (vsi_nn_kernel_node_t)node;
|
||||
}
|
||||
|
||||
vsi_nn_kernel_node_t vsi_nn_sp_in_times_v11_plus_v12_node
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t * input,
|
||||
vsi_nn_tensor_t * dummy_tensor,
|
||||
vsi_nn_tensor_t * output
|
||||
)
|
||||
{
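/*
** Affine pass: out = in * V11 + V12, with V11/V12 pre-loaded by
** vsi_nn_sp_load_weight_bias_node (presumably gamma and beta in the
** layer-norm pipeline below). The dummy_tensor input only carries the data
** dependency on that loader node.
*/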
|
||||
const int32_t spLoopInstsNum = 1;
|
||||
const int32_t spInstsNum = spLoopInstsNum;
|
||||
|
||||
const uint32_t input_count = 2;
|
||||
const uint32_t output_count = 1;
|
||||
vx_tensor inputs_tensor[3] = {NULL};
|
||||
vx_tensor outputs_tensor[1] = {NULL};
|
||||
vx_node node = NULL;
|
||||
int32_t max_vector_depth = graph->ctx->config.sp_vector_depth /
|
||||
graph->ctx->config.sp_exec_count;
|
||||
|
||||
vsi_nn_spinst_t *spinst = NULL;
|
||||
vsi_nn_spinst_inst_param sp_insts_param[1];
|
||||
vsi_nn_spinst_attr_t attr;
|
||||
|
||||
vsi_status status = VSI_FAILURE;
|
||||
|
||||
memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum);
|
||||
vsi_nn_init_spinst_attr(&attr);
|
||||
|
||||
/* loop inst0: r1 = in * v11 || out = r1 + v12 */
|
||||
status = vsi_nn_sp_mul(&sp_insts_param[0], VSI_NN_SP_SRIN, VSI_NN_SP_VR11, VSI_NN_SP_SR1);
|
||||
status |= vsi_nn_sp_add(&sp_insts_param[0], VSI_NN_SP_SR1, VSI_NN_SP_VR12, VSI_NN_SP_SROUT);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
||||
attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_YZMERGE;
|
||||
|
||||
attr.input_setup = VSI_NN_SP_INPUT_SETUP_SINGLE_INPUT;
|
||||
attr.prog_loop_instr_num = spLoopInstsNum;
|
||||
attr.ignored_leading_outputs = 3;
|
||||
attr.ignored_leading_v11_rd = 0;
|
||||
attr.ignored_leading_v12_rd = 3;
|
||||
attr.flush_cycle_num = 3;
|
||||
attr.v11_push_pop_config = VSI_NN_SP_PUSH_POP_EVERY_ROW;
|
||||
attr.v12_push_pop_config = VSI_NN_SP_PUSH_POP_EVERY_ROW;
|
||||
|
||||
attr.num_of_v11_rd_in_flush_cycle = 0;
|
||||
attr.num_of_v12_rd_in_flush_cycle = 3;
|
||||
|
||||
attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_YZ;
|
||||
attr.split_max_vector_depth = max_vector_depth;
|
||||
|
||||
spinst = vsi_nn_create_spinst(graph);
|
||||
CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final );
|
||||
status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum);
|
||||
status |= vsi_nn_set_spinst_attr(spinst, attr);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
||||
inputs_tensor[0] = input->t;
|
||||
inputs_tensor[1] = dummy_tensor->t;
|
||||
outputs_tensor[0] = output->t;
|
||||
node = vxStreamProcessorNode(
|
||||
graph->g,
|
||||
inputs_tensor,
|
||||
input_count,
|
||||
outputs_tensor,
|
||||
output_count,
|
||||
spinst->sp,
|
||||
NULL);
|
||||
|
||||
final:
|
||||
if (spinst)
|
||||
{
|
||||
vsi_nn_release_spinst(&spinst);
|
||||
}
|
||||
|
||||
return (vsi_nn_kernel_node_t)node;
|
||||
}
|
||||
|
||||
/*
|
||||
** This program requires a sum operation in the Y dimension.
** Instead of using the SUM Engine, the sum needs to be performed
** by Stream Processor instructions.
|
||||
*/
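/*
** Pipeline sketch (one reading of the node graph built below): layer norm
** along the Y direction is split into five SP programs --
**   1. moments_axis1:          V11 = sum(x), V12 = sum(x^2) per row
**   2. ln_means_axis1:         derive the mean and a 1/sqrt(var + eps)
**                              style factor through the LINEAR_RSQRT LUT
**   3. layer_norm_axis1:       out = (x - mean) * inv_std
**   4. load_weight_bias:       park gamma/beta in V11/V12
**   5. in_times_v11_plus_v12:  out = normalized * gamma + beta
** which together implement
**   y = (x - mean(x)) / sqrt(var(x) + eps) * gamma + beta.
*/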
|
||||
vsi_nn_kernel_node_t layer_norm_y_direction
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
const vsi_nn_kernel_param_t * params
|
||||
)
|
||||
{
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
vsi_nn_tensor_attr_t attr;
|
||||
vsi_nn_tensor_t * dummy_tensor[3] = {NULL};
|
||||
vsi_nn_tensor_t * output_tensor[2] = {NULL};
|
||||
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
|
||||
float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]);
|
||||
float eps = vsi_nn_kernel_param_get_float32( params, "eps" );
|
||||
float inv_m = 1.0f / (float)(outputs[0]->attr.size[0]);
|
||||
float s = inv_m * inv_m;
|
||||
float const_a = (float)(outputs[0]->attr.size[0]);
|
||||
|
||||
memcpy( &attr, &outputs[0]->attr, sizeof(vsi_nn_tensor_attr_t) );
|
||||
attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
|
||||
attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
|
||||
attr.is_const = FALSE;
|
||||
attr.vtl = TRUE;
|
||||
attr.is_dummy = TRUE;
|
||||
attr.size[axis] = 1;
|
||||
dummy_tensor[0] = vsi_nn_CreateTensor( graph, &attr );
|
||||
CHECK_PTR_FAIL_GOTO( dummy_tensor[0], "Create dummy_tensor fail.", final );
|
||||
dummy_tensor[1] = vsi_nn_CreateTensor( graph, &attr );
|
||||
CHECK_PTR_FAIL_GOTO( dummy_tensor[1], "Create dummy_tensor fail.", final );
|
||||
memcpy( &attr.size, &inputs[2]->attr.size, sizeof(inputs[2]->attr.size) );
|
||||
attr.dim_num = inputs[2]->attr.dim_num;
|
||||
dummy_tensor[2] = vsi_nn_CreateTensor( graph, &attr );
|
||||
CHECK_PTR_FAIL_GOTO( dummy_tensor[2], "Create dummy_tensor fail.", final );
|
||||
|
||||
memcpy( &attr, &outputs[0]->attr, sizeof(vsi_nn_tensor_attr_t) );
|
||||
attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
|
||||
attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
|
||||
attr.is_const = FALSE;
|
||||
attr.vtl = TRUE;
|
||||
output_tensor[0] = vsi_nn_CreateTensor( graph, &attr );
|
||||
CHECK_PTR_FAIL_GOTO( output_tensor[0], "Create tensor fail.", final );
|
||||
output_tensor[1] = vsi_nn_CreateTensor( graph, &attr );
|
||||
CHECK_PTR_FAIL_GOTO( output_tensor[1], "Create tensor fail.", final );
|
||||
|
||||
node = vsi_nn_sp_moments_axis1_node(graph, inputs[0], output_tensor[0], dummy_tensor[0]);
|
||||
CHECK_PTR_FAIL_GOTO( node, "Create sp_moments_axis1 fail.", final );
|
||||
node = vsi_nn_sp_ln_means_axis1_node(graph, dummy_tensor[0], dummy_tensor[1],
|
||||
inv_m, const_a, s, eps, output_scale);
|
||||
CHECK_PTR_FAIL_GOTO( node, "Create ln_y_direction_means fail.", final );
|
||||
node = vsi_nn_sp_layer_norm_axis1_node(graph, output_tensor[0], dummy_tensor[1], output_tensor[1]);
|
||||
CHECK_PTR_FAIL_GOTO( node, "Create layer_norm_axis1 fail.", final );
|
||||
|
||||
node = vsi_nn_sp_load_weight_bias_node(graph, inputs[2], inputs[1], dummy_tensor[2]);
|
||||
CHECK_PTR_FAIL_GOTO( node, "Create mov_weight_bias fail.", final );
|
||||
node = vsi_nn_sp_in_times_v11_plus_v12_node(graph, output_tensor[1], dummy_tensor[2], outputs[0]);
|
||||
CHECK_PTR_FAIL_GOTO( node, "Create in_times_v11_plus_v12 fail.", final );
|
||||
|
||||
final:
|
||||
vsi_safe_release_tensor(dummy_tensor[0]);
|
||||
vsi_safe_release_tensor(dummy_tensor[1]);
|
||||
vsi_safe_release_tensor(dummy_tensor[2]);
|
||||
vsi_safe_release_tensor(output_tensor[0]);
|
||||
vsi_safe_release_tensor(output_tensor[1]);
|
||||
|
||||
return node;
|
||||
} /* layer_norm_y_direction() */
|
||||
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,938 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2021 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_node.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "kernel/vsi_nn_sp_unit_operation.h"
|
||||
#include "kernel/vsi_nn_sp_lut.h"
|
||||
|
||||
#if (VX_STREAM_PROCESSOR_SUPPORT)
|
||||
|
||||
vsi_nn_spinst_t * vsi_nn_sp_max_axis2_inst
|
||||
(
|
||||
vx_context context,
|
||||
int32_t fifo_depth,
|
||||
int32_t max_vector_depth
|
||||
)
|
||||
{
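/*
** Running-max pass (inferred from the loop comments): the input is scaled
** and clamped into r1, forwarded to the output unchanged, and V11 tracks the
** largest value seen so far on the row through the compare-and-select
** (v11 = r5 ? r8 : r9), with the comparison value seeded to -INF.
** Both branches encode the same logic at different pipeline depths.
*/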
|
||||
vsi_status status = VSI_FAILURE;
|
||||
const int32_t spInitInstsNum = 4;
|
||||
const int32_t spLoopInstsNum = fifo_depth > 4 ? 3 : 11;
|
||||
const int32_t spInstsNum = spInitInstsNum + spLoopInstsNum;
|
||||
uint32_t f32_min = 0xff800000;
|
||||
float clampMin = *(float*)&f32_min;
|
||||
vsi_nn_spinst_t *spinst = NULL;
|
||||
vsi_nn_spinst_inst_param sp_insts_param[15];
|
||||
vsi_nn_spinst_attr_t attr;
|
||||
|
||||
memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum);
|
||||
vsi_nn_init_spinst_attr(&attr);
|
||||
|
||||
/* init inst0: r2 = -INF */
|
||||
status = vsi_nn_sp_move_constant(&sp_insts_param[0], clampMin, VSI_NN_SP_SR2);
|
||||
/* init inst1: r10 = 0 */
|
||||
status |= vsi_nn_sp_move_constant(&sp_insts_param[1], 0, VSI_NN_SP_SR10);
|
||||
/* init inst2: r4 = 1 */
|
||||
status |= vsi_nn_sp_move_constant(&sp_insts_param[2], 1, VSI_NN_SP_SR4);
|
||||
/* init inst3: nop */
|
||||
status |= vsi_nn_sp_nop(&sp_insts_param[3]);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final);
|
||||
|
||||
if (fifo_depth > 4)
|
||||
{
|
||||
/* loop inst0: r1 = clamp(r3 * in, r6, r7) | r2 = v11 + r10 | r9 = r2 */
|
||||
status = vsi_nn_sp_mul_clamp(&sp_insts_param[4], VSI_NN_SP_SR3, VSI_NN_SP_SRIN, VSI_NN_SP_SR1);
|
||||
status |= vsi_nn_sp_add(&sp_insts_param[4], VSI_NN_SP_VR11, VSI_NN_SP_SR10, VSI_NN_SP_SR2);
|
||||
status |= vsi_nn_sp_move(&sp_insts_param[4], VSI_NN_SP_SR2, VSI_NN_SP_SR9);
|
||||
/* loop inst1: r8 = r1 * r4 | r5 = r1 - r2 | v11 = r5 ? r8 : r9 */
|
||||
status |= vsi_nn_sp_mul(&sp_insts_param[5], VSI_NN_SP_SR1, VSI_NN_SP_SR3, VSI_NN_SP_SR8);
|
||||
status |= vsi_nn_sp_sub(&sp_insts_param[5], VSI_NN_SP_SR1, VSI_NN_SP_SR2, VSI_NN_SP_SR5);
|
||||
status |= vsi_nn_sp_move_sel0(&sp_insts_param[5], VSI_NN_SP_SR5, VSI_NN_SP_SR8, VSI_NN_SP_VR11);
|
||||
/* loop inst2: out = r1 */
|
||||
status |= vsi_nn_sp_move(&sp_insts_param[6], VSI_NN_SP_SR1, VSI_NN_SP_SROUT);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
||||
attr.flush_cycle_num = 7;
|
||||
|
||||
attr.ignored_leading_outputs = 1;
|
||||
attr.ignored_leading_v11_rd = fifo_depth;
|
||||
attr.ignored_leading_v11_wr = 2;
|
||||
|
||||
attr.num_of_v11_rd_in_flush_cycle = 0;
|
||||
attr.num_of_v11_wr_in_flush_cycle = 3;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* loop inst0: r1 = clamp(r3 * in, r6, r7) | r2 = v11 + r10 | r9 = r2 */
|
||||
status = vsi_nn_sp_mul_clamp(&sp_insts_param[4], VSI_NN_SP_SR3, VSI_NN_SP_SRIN, VSI_NN_SP_SR1);
|
||||
status |= vsi_nn_sp_add(&sp_insts_param[4], VSI_NN_SP_VR11, VSI_NN_SP_SR10, VSI_NN_SP_SR2);
|
||||
/* loop inst1: nop */
|
||||
status |= vsi_nn_sp_nop(&sp_insts_param[5]);
|
||||
/* loop inst2: nop */
|
||||
status |= vsi_nn_sp_nop(&sp_insts_param[6]);
|
||||
/* loop inst3: r8 = r1 * r4 | r5 = r1 - r2 | v11 = r5 ? r8 : r9 */
|
||||
status |= vsi_nn_sp_mul(&sp_insts_param[7], VSI_NN_SP_SR1, VSI_NN_SP_SR3, VSI_NN_SP_SR8);
|
||||
status |= vsi_nn_sp_sub(&sp_insts_param[7], VSI_NN_SP_SR1, VSI_NN_SP_SR2, VSI_NN_SP_SR5);
|
||||
status |= vsi_nn_sp_move(&sp_insts_param[7], VSI_NN_SP_SR2, VSI_NN_SP_SR9);
|
||||
/* loop inst4: out = r1 */
|
||||
status |= vsi_nn_sp_move(&sp_insts_param[8], VSI_NN_SP_SR1, VSI_NN_SP_SROUT);
|
||||
/* loop inst5: nop */
|
||||
status |= vsi_nn_sp_nop(&sp_insts_param[9]);
|
||||
/* loop inst6: v11 = r5 ? r8 : r9 */
status |= vsi_nn_sp_move_sel0(&sp_insts_param[10], VSI_NN_SP_SR5, VSI_NN_SP_SR8, VSI_NN_SP_VR11);
|
||||
/* loop inst7: nop */
|
||||
status |= vsi_nn_sp_nop(&sp_insts_param[11]);
|
||||
/* loop inst8: nop */
|
||||
status |= vsi_nn_sp_nop(&sp_insts_param[12]);
|
||||
/* loop inst9: nop */
|
||||
status |= vsi_nn_sp_nop(&sp_insts_param[13]);
|
||||
/* loop inst10: nop */
|
||||
status |= vsi_nn_sp_nop(&sp_insts_param[14]);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
||||
attr.ignored_leading_outputs = 0;
|
||||
attr.ignored_leading_v11_rd = fifo_depth;
|
||||
attr.ignored_leading_v11_wr = 0;
|
||||
|
||||
attr.num_of_v11_rd_in_flush_cycle = 0;
|
||||
attr.num_of_v11_wr_in_flush_cycle = 1;
|
||||
|
||||
attr.flush_cycle_num = 10;
|
||||
}
|
||||
|
||||
attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_XYMERGE;
|
||||
attr.input_setup = VSI_NN_SP_INPUT_SETUP_SINGLE_INPUT;
|
||||
|
||||
attr.prog_init_instr_num = spInitInstsNum;
|
||||
attr.prog_loop_instr_num = spLoopInstsNum;
|
||||
attr.v11_reset_at_start = VSI_NN_SP_V_RESET_AT_START_RESET;
|
||||
|
||||
attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_XY;
|
||||
attr.split_tilex_equal_imgx = TRUE;
|
||||
attr.split_max_vector_depth = max_vector_depth;
|
||||
|
||||
spinst = vsi_nn_create_spinst_by_context(context);
|
||||
CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final );
|
||||
status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum);
|
||||
status |= vsi_nn_set_spinst_attr(spinst, attr);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
||||
final:
|
||||
return spinst;
|
||||
}
|
||||
|
||||
DEF_SP_KERNEL_QUERY(max_axis2_query)
|
||||
(
|
||||
vsi_nn_kernel_node_t node
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vx_size index = 0;
|
||||
vx_size tile_size[2] = {0};
|
||||
vsi_nn_spinst_t *spinst = NULL;
|
||||
int32_t fifo_depth = 0;
|
||||
int32_t max_vector_depth = 0;
|
||||
vx_context ctx = vxGetContext((vx_reference)node);
|
||||
vx_hardware_caps_params_ext2_t hw_param;
|
||||
|
||||
memset(&hw_param, 0, sizeof(vx_hardware_caps_params_ext2_t));
|
||||
status = vxQueryHardwareCaps(ctx, (vx_hardware_caps_params_t*)(&hw_param), sizeof(vx_hardware_caps_params_ext2_t));
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
|
||||
status = vxQueryNode(node, VX_NODE_SWTILING_TILE_XY, tile_size, sizeof(tile_size));
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
status = vxQueryNode(node, VX_NODE_SPINST_INDEX, &index, sizeof(index));
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
|
||||
fifo_depth = (int32_t)ceil((float)(tile_size[0] * tile_size[1]) / (float)hw_param.streamProcessorExecCount);
|
||||
max_vector_depth = hw_param.streamProcessorVectorSize;
|
||||
|
||||
spinst = vsi_nn_sp_max_axis2_inst(ctx, fifo_depth, max_vector_depth);
|
||||
|
||||
status = vxSetParameterByIndex( node, (uint32_t)index, (vx_reference)spinst->sp );
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
|
||||
final:
|
||||
if (spinst)
|
||||
{
|
||||
vsi_nn_release_spinst(&spinst);
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
vsi_nn_kernel_node_t vsi_nn_sp_max_axis2_node
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t * input,
|
||||
vsi_nn_tensor_t * output0,
|
||||
vsi_nn_tensor_t * output1
|
||||
)
|
||||
{
|
||||
const int32_t spInitInstsNum = 4;
|
||||
const int32_t spLoopInstsNum = 3;
|
||||
const int32_t spInstsNum = spInitInstsNum + spLoopInstsNum;
|
||||
|
||||
const uint32_t input_count = 1;
|
||||
const uint32_t output_count = 2;
|
||||
vx_tensor inputs_tensor[1] = {NULL};
|
||||
vx_tensor outputs_tensor[2] = {NULL};
|
||||
vx_node node = NULL;
|
||||
|
||||
vsi_nn_spinst_t *spinst = NULL;
|
||||
vsi_nn_spinst_inst_param sp_insts_param[7];
|
||||
vsi_nn_spinst_attr_t attr;
|
||||
|
||||
vsi_status status = VSI_FAILURE;
|
||||
int32_t max_vector_depth = graph->ctx->config.sp_vector_depth;
|
||||
uint32_t f32_min = 0xff800000;
|
||||
float flt_min = *(float*)&f32_min;
|
||||
float input_scale = vsi_nn_get_tensor_scale(input);
|
||||
float clamp_min = 0;
|
||||
float clamp_max = 0;
|
||||
|
||||
memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum);
|
||||
vsi_nn_init_spinst_attr(&attr);
|
||||
|
||||
vsi_nn_get_tensor_clamp_min_max(input, &clamp_min, &clamp_max);
|
||||
clamp_min = clamp_min * input_scale;
|
||||
clamp_max = clamp_max * input_scale;
|
||||
|
||||
/* init inst0: r2 = -INF */
|
||||
status = vsi_nn_sp_move_constant(&sp_insts_param[0], flt_min, VSI_NN_SP_SR2);
|
||||
/* init inst1: r10 = 0 */
|
||||
status |= vsi_nn_sp_move_constant(&sp_insts_param[1], 0, VSI_NN_SP_SR10);
|
||||
/* init inst2: r4 = 1 */
|
||||
status |= vsi_nn_sp_move_constant(&sp_insts_param[2], 1, VSI_NN_SP_SR4);
|
||||
/* init inst3: nop */
|
||||
status |= vsi_nn_sp_nop(&sp_insts_param[3]);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final);
|
||||
|
||||
/* loop inst0: r1 = clamp(r3 * in, r6, r7) | r2 = v11 + r10 | r9 = r2 */
|
||||
status = vsi_nn_sp_mul_clamp(&sp_insts_param[4], VSI_NN_SP_SR3, VSI_NN_SP_SRIN, VSI_NN_SP_SR1);
|
||||
status |= vsi_nn_sp_add(&sp_insts_param[4], VSI_NN_SP_VR11, VSI_NN_SP_SR10, VSI_NN_SP_SR2);
|
||||
status |= vsi_nn_sp_move(&sp_insts_param[4], VSI_NN_SP_SR2, VSI_NN_SP_SR9);
|
||||
/* loop inst1: r8 = r1 * r4 | r5 = r1 - r2 | v11 = r5 ? r8 : r9 */
|
||||
status |= vsi_nn_sp_mul(&sp_insts_param[5], VSI_NN_SP_SR1, VSI_NN_SP_SR3, VSI_NN_SP_SR8);
|
||||
status |= vsi_nn_sp_sub(&sp_insts_param[5], VSI_NN_SP_SR1, VSI_NN_SP_SR2, VSI_NN_SP_SR5);
|
||||
status |= vsi_nn_sp_move_sel0(&sp_insts_param[5], VSI_NN_SP_SR5, VSI_NN_SP_SR8, VSI_NN_SP_VR11);
|
||||
/* loop inst2: out = r1 */
|
||||
status |= vsi_nn_sp_move(&sp_insts_param[6], VSI_NN_SP_SR1, VSI_NN_SP_SROUT);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
||||
attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_XYMERGE;
|
||||
attr.input_setup = VSI_NN_SP_INPUT_SETUP_SINGLE_INPUT;
|
||||
|
||||
attr.prog_init_instr_num = spInitInstsNum;
|
||||
attr.prog_loop_instr_num = spLoopInstsNum;
|
||||
attr.v11_reset_at_start = VSI_NN_SP_V_RESET_AT_START_RESET;
|
||||
|
||||
attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_XY;
|
||||
attr.split_tilex_equal_imgx = TRUE;
|
||||
attr.split_max_vector_depth = max_vector_depth;
|
||||
|
||||
attr.flush_cycle_num = 7;
|
||||
|
||||
attr.ignored_leading_outputs = 1;
|
||||
attr.ignored_leading_v11_rd = 5;
|
||||
attr.ignored_leading_v11_wr = 2;
|
||||
|
||||
attr.num_of_v11_rd_in_flush_cycle = 0;
|
||||
attr.num_of_v11_wr_in_flush_cycle = 3;
|
||||
|
||||
VSI_NN_SP_ATTR_SET_CONST_TO_SR3(attr, input_scale);
|
||||
VSI_NN_SP_ATTR_SET_CONST_TO_SR6(attr, clamp_max);
|
||||
VSI_NN_SP_ATTR_SET_CONST_TO_SR7(attr, clamp_min);
|
||||
|
||||
|
||||
|
||||
spinst = vsi_nn_create_spinst(graph);
|
||||
CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final );
|
||||
status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum);
|
||||
status |= vsi_nn_set_spinst_attr(spinst, attr);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
||||
inputs_tensor[0] = input->t;
|
||||
outputs_tensor[0] = output0->t;
|
||||
outputs_tensor[1] = output1->t;
|
||||
node = vxStreamProcessorNode(
|
||||
graph->g,
|
||||
inputs_tensor,
|
||||
input_count,
|
||||
outputs_tensor,
|
||||
output_count,
|
||||
spinst->sp,
|
||||
NULL);
|
||||
|
||||
final:
|
||||
|
||||
if (node)
|
||||
{
|
||||
vxAssignNodeQueryCallback(node, max_axis2_query);
|
||||
}
|
||||
|
||||
if (spinst)
|
||||
{
|
||||
vsi_nn_release_spinst(&spinst);
|
||||
}
|
||||
|
||||
return (vsi_nn_kernel_node_t)node;
|
||||
}
|
||||
|
||||
vsi_nn_spinst_t * vsi_nn_sp_exp_y_direction_inst
|
||||
(
|
||||
vx_context context,
|
||||
int32_t fifo_depth,
|
||||
int32_t max_vector_depth
|
||||
)
|
||||
{
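/*
** Exponentiation pass (inferred from the LINEAR_EXP LUT configured in the
** softmax node below): evaluates an exp-like curve of (in - V11) through the
** piecewise-linear LUT (base + slope * offset assembled in r7), writes the
** result to the output, and accumulates the running sum of those results
** into V12 -- the softmax denominator. The two branches only differ in
** pipelining.
*/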
|
||||
vsi_status status = VSI_FAILURE;
|
||||
const int32_t spInitInstsNum = 2;
|
||||
const int32_t spLoopInstsNum = fifo_depth > 3 ? 4 : 8;
|
||||
const int32_t spInstsNum = spInitInstsNum + spLoopInstsNum;
|
||||
vsi_nn_spinst_t *spinst = NULL;
|
||||
vsi_nn_spinst_inst_param sp_insts_param[10];
|
||||
vsi_nn_spinst_attr_t attr;
|
||||
|
||||
memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum);
|
||||
vsi_nn_init_spinst_attr(&attr);
|
||||
|
||||
/* init inst0: r8 = 0 */
|
||||
status = vsi_nn_sp_move_constant(&sp_insts_param[0], 0, VSI_NN_SP_SR8);
|
||||
/* init inst1: r9 = 1 */
|
||||
status |= vsi_nn_sp_move_constant(&sp_insts_param[1], 1, VSI_NN_SP_SR9);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final);
|
||||
|
||||
if (fifo_depth > 3)
|
||||
{
|
||||
/* loop inst0: r2 = in - v11 | v11 = v11 */
|
||||
status = vsi_nn_sp_sub(&sp_insts_param[2], VSI_NN_SP_SRIN, VSI_NN_SP_VR11, VSI_NN_SP_SR2);
|
||||
status |= vsi_nn_sp_move(&sp_insts_param[2], VSI_NN_SP_VR11, VSI_NN_SP_VR11);
|
||||
/* loop inst1: r8 = v12 * r9 | r7 = r4 + r6 | out = r7 */
|
||||
status |= vsi_nn_sp_mul(&sp_insts_param[3], VSI_NN_SP_VR12, VSI_NN_SP_SR9, VSI_NN_SP_SR8);
|
||||
status |= vsi_nn_sp_add(&sp_insts_param[3], VSI_NN_SP_SR4, VSI_NN_SP_SR6, VSI_NN_SP_SR7);
|
||||
status |= vsi_nn_sp_move(&sp_insts_param[3], VSI_NN_SP_SR7, VSI_NN_SP_SROUT);
|
||||
/* loop inst2: r6 = r5 * r2 | v12 = r7 + r8 | r4 = r3 */
|
||||
status |= vsi_nn_sp_mul(&sp_insts_param[4], VSI_NN_SP_SR5, VSI_NN_SP_SR2, VSI_NN_SP_SR6);
|
||||
status |= vsi_nn_sp_add(&sp_insts_param[4], VSI_NN_SP_SR7, VSI_NN_SP_SR8, VSI_NN_SP_VR12);
|
||||
status |= vsi_nn_sp_move(&sp_insts_param[4], VSI_NN_SP_SR3, VSI_NN_SP_SR4);
|
||||
/* loop inst3: r1 = setup(r2) | r5 = pwlMul * pwlMul | r2 = pwlAdd + pwlAdd | r1 = r3*/
|
||||
status |= vsi_nn_sp_pwl_setup0(&sp_insts_param[5], VSI_NN_SP_SR2, VSI_NN_SP_SR1);
|
||||
status |= vsi_nn_sp_mul(&sp_insts_param[5], VSI_NN_SP_PWLMUL, VSI_NN_SP_PWLMUL, VSI_NN_SP_SR5);
|
||||
status |= vsi_nn_sp_sub(&sp_insts_param[5], VSI_NN_SP_PWLADD, VSI_NN_SP_PWLADD, VSI_NN_SP_SR2);
|
||||
status |= vsi_nn_sp_move(&sp_insts_param[5], VSI_NN_SP_SR1, VSI_NN_SP_SR3);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
||||
attr.flush_cycle_num = 18;
|
||||
|
||||
attr.ignored_leading_outputs = 4;
|
||||
attr.ignored_leading_v11_rd = 0;
|
||||
attr.ignored_leading_v11_wr = 0;
|
||||
attr.ignored_leading_v12_rd = fifo_depth + 3;
|
||||
attr.ignored_leading_v12_wr = 4;
|
||||
|
||||
attr.num_of_v12_rd_in_flush_cycle = 4;
|
||||
attr.num_of_v12_wr_in_flush_cycle = 5;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* loop inst0: r2 = in - v11 | v11 = v11 */
|
||||
status = vsi_nn_sp_sub(&sp_insts_param[2], VSI_NN_SP_SRIN, VSI_NN_SP_VR11, VSI_NN_SP_SR2);
|
||||
status |= vsi_nn_sp_move(&sp_insts_param[2], VSI_NN_SP_VR11, VSI_NN_SP_VR11);
|
||||
/* loop inst1: r6 = r5 * r2 | r4 = r3 */
|
||||
status |= vsi_nn_sp_mul(&sp_insts_param[3], VSI_NN_SP_SR5, VSI_NN_SP_SR2, VSI_NN_SP_SR6);
|
||||
status |= vsi_nn_sp_move(&sp_insts_param[3], VSI_NN_SP_SR3, VSI_NN_SP_SR4);
|
||||
/* loop inst2: nop */
|
||||
status |= vsi_nn_sp_nop(&sp_insts_param[4]);
|
||||
/* loop inst3: r1 = setup(r2) */
status |= vsi_nn_sp_pwl_setup0(&sp_insts_param[5], VSI_NN_SP_SR2, VSI_NN_SP_SR1);
|
||||
/* loop inst4: r8 = v12 * r9 | r7 = r4 + r6 */
|
||||
status |= vsi_nn_sp_mul(&sp_insts_param[6], VSI_NN_SP_VR12, VSI_NN_SP_SR9, VSI_NN_SP_SR8);
|
||||
status |= vsi_nn_sp_add(&sp_insts_param[6], VSI_NN_SP_SR4, VSI_NN_SP_SR6, VSI_NN_SP_SR7);
|
||||
/* loop inst5: nop */
|
||||
status |= vsi_nn_sp_nop(&sp_insts_param[7]);
|
||||
/* loop inst6: r5 = pwlMul * pwlMul | r2 = pwlAdd + pwlAdd | r1 = r3*/
|
||||
status |= vsi_nn_sp_mul(&sp_insts_param[8], VSI_NN_SP_PWLMUL, VSI_NN_SP_PWLMUL, VSI_NN_SP_SR5);
|
||||
status |= vsi_nn_sp_sub(&sp_insts_param[8], VSI_NN_SP_PWLADD, VSI_NN_SP_PWLADD, VSI_NN_SP_SR2);
|
||||
status |= vsi_nn_sp_move(&sp_insts_param[8], VSI_NN_SP_SR1, VSI_NN_SP_SR3);
|
||||
/* loop inst7: v12 = r7 + r8 | out = r7 */
|
||||
status |= vsi_nn_sp_add(&sp_insts_param[9], VSI_NN_SP_SR7, VSI_NN_SP_SR8, VSI_NN_SP_VR12);
|
||||
status |= vsi_nn_sp_move(&sp_insts_param[9], VSI_NN_SP_SR7, VSI_NN_SP_SROUT);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
||||
attr.ignored_leading_outputs = 1;
|
||||
attr.ignored_leading_v11_rd = 0;
|
||||
attr.ignored_leading_v11_wr = 0;
|
||||
attr.ignored_leading_v12_rd = fifo_depth + 1;
|
||||
attr.ignored_leading_v12_wr = 1;
|
||||
|
||||
attr.num_of_v12_rd_in_flush_cycle = 2;
|
||||
attr.num_of_v12_wr_in_flush_cycle = 2;
|
||||
|
||||
attr.flush_cycle_num = 15;
|
||||
}
|
||||
|
||||
attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_XYMERGE;
|
||||
attr.input_setup = VSI_NN_SP_INPUT_SETUP_SINGLE_INPUT;
|
||||
|
||||
attr.prog_init_instr_num = spInitInstsNum;
|
||||
attr.prog_loop_instr_num = spLoopInstsNum;
|
||||
attr.v12_reset_at_start = VSI_NN_SP_V_RESET_AT_START_RESET;
|
||||
|
||||
attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_XY;
|
||||
attr.split_tilex_equal_imgx = TRUE;
|
||||
attr.split_max_vector_depth = max_vector_depth;
|
||||
|
||||
spinst = vsi_nn_create_spinst_by_context(context);
|
||||
CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final );
|
||||
status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum);
|
||||
status |= vsi_nn_set_spinst_attr(spinst, attr);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
||||
final:
|
||||
return spinst;
|
||||
}
|
||||
|
||||
DEF_SP_KERNEL_QUERY(softmax_z_direction_exp_query)
|
||||
(
|
||||
vsi_nn_kernel_node_t node
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vx_size index = 0;
|
||||
vx_size tile_size[2] = {0};
|
||||
vsi_nn_spinst_t *spinst = NULL;
|
||||
int32_t fifo_depth = 0;
|
||||
int32_t max_vector_depth = 0;
|
||||
vx_context ctx = vxGetContext((vx_reference)node);
|
||||
vx_hardware_caps_params_ext2_t hw_param;
|
||||
|
||||
memset(&hw_param, 0, sizeof(vx_hardware_caps_params_ext2_t));
|
||||
status = vxQueryHardwareCaps(ctx, (vx_hardware_caps_params_t*)(&hw_param), sizeof(vx_hardware_caps_params_ext2_t));
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
|
||||
status = vxQueryNode(node, VX_NODE_SWTILING_TILE_XY, tile_size, sizeof(tile_size));
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
status = vxQueryNode(node, VX_NODE_SPINST_INDEX, &index, sizeof(index));
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
|
||||
fifo_depth = (int32_t)ceil((float)(tile_size[0] * tile_size[1])/ (float)hw_param.streamProcessorExecCount);
|
||||
max_vector_depth = hw_param.streamProcessorVectorSize;
|
||||
|
||||
spinst = vsi_nn_sp_exp_y_direction_inst(ctx, fifo_depth, max_vector_depth);
|
||||
|
||||
status = vxSetParameterByIndex( node, (uint32_t)index, (vx_reference)spinst->sp );
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
|
||||
final:
|
||||
if (spinst)
|
||||
{
|
||||
vsi_nn_release_spinst(&spinst);
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
vsi_nn_kernel_node_t vsi_nn_sp_softmax_z_direction_exp_node
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t * input0,
|
||||
vsi_nn_tensor_t * input1,
|
||||
vsi_nn_tensor_t * output0,
|
||||
vsi_nn_tensor_t * output1,
|
||||
float beta
|
||||
)
|
||||
{
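/*
** Node wrapper for the vsi_nn_sp_exp_y_direction_inst-style program: same
** loop body, plus the LUT setup below (VSI_NN_SP_ACT_LINEAR_EXP with
** params[0] = beta), so the PWL unit evaluates the exp curve used by softmax.
*/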
|
||||
const int32_t spInitInstsNum = 2;
|
||||
const int32_t spLoopInstsNum = 4;
|
||||
const int32_t spInstsNum = spInitInstsNum + spLoopInstsNum;
|
||||
|
||||
const uint32_t input_count = 2;
|
||||
const uint32_t output_count = 2;
|
||||
vx_tensor inputs_tensor[2] = {NULL};
|
||||
vx_tensor outputs_tensor[2] = {NULL};
|
||||
vx_node node = NULL;
|
||||
|
||||
vsi_nn_spinst_t *spinst = NULL;
|
||||
vsi_nn_spinst_inst_param sp_insts_param[6];
|
||||
vsi_nn_spinst_attr_t attr;
|
||||
|
||||
vsi_nn_sp_lut_params sp_lut_params;
|
||||
vx_lut_params_s vx_lut_params;
|
||||
|
||||
vsi_status status = VSI_FAILURE;
|
||||
int32_t fifo_depth = 4;
|
||||
int32_t max_vector_depth = graph->ctx->config.sp_vector_depth;
|
||||
|
||||
memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum);
|
||||
vsi_nn_init_spinst_attr(&attr);
|
||||
memset(&sp_lut_params, 0, sizeof(vsi_nn_sp_lut_params));
|
||||
memset(&vx_lut_params, 0, sizeof(vx_lut_params_s));
|
||||
|
||||
/* init inst0: r8 = 0 */
|
||||
status = vsi_nn_sp_move_constant(&sp_insts_param[0], 0, VSI_NN_SP_SR8);
|
||||
/* init inst1: r9 = 1 */
|
||||
status |= vsi_nn_sp_move_constant(&sp_insts_param[1], 1, VSI_NN_SP_SR9);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final);
|
||||
|
||||
/* loop inst0: r2 = in - v11 | v11 = v11 */
|
||||
status = vsi_nn_sp_sub(&sp_insts_param[2], VSI_NN_SP_SRIN, VSI_NN_SP_VR11, VSI_NN_SP_SR2);
|
||||
status |= vsi_nn_sp_move(&sp_insts_param[2], VSI_NN_SP_VR11, VSI_NN_SP_VR11);
|
||||
/* loop inst1: r8 = v12 * r9 | r7 = r4 + r6 | out = r7 */
|
||||
status |= vsi_nn_sp_mul(&sp_insts_param[3], VSI_NN_SP_VR12, VSI_NN_SP_SR9, VSI_NN_SP_SR8);
|
||||
status |= vsi_nn_sp_add(&sp_insts_param[3], VSI_NN_SP_SR4, VSI_NN_SP_SR6, VSI_NN_SP_SR7);
|
||||
status |= vsi_nn_sp_move(&sp_insts_param[3], VSI_NN_SP_SR7, VSI_NN_SP_SROUT);
|
||||
/* loop inst2: r6 = r5 * r2 | v12 = r7 + r8 | r4 = r3 */
|
||||
status |= vsi_nn_sp_mul(&sp_insts_param[4], VSI_NN_SP_SR5, VSI_NN_SP_SR2, VSI_NN_SP_SR6);
|
||||
status |= vsi_nn_sp_add(&sp_insts_param[4], VSI_NN_SP_SR7, VSI_NN_SP_SR8, VSI_NN_SP_VR12);
|
||||
status |= vsi_nn_sp_move(&sp_insts_param[4], VSI_NN_SP_SR3, VSI_NN_SP_SR4);
|
||||
/* loop inst3: r1 = setup(r2) | r5 = pwlMul * pwlMul | r2 = pwlAdd + pwlAdd | r1 = r3*/
|
||||
status |= vsi_nn_sp_pwl_setup0(&sp_insts_param[5], VSI_NN_SP_SR2, VSI_NN_SP_SR1);
|
||||
status |= vsi_nn_sp_mul(&sp_insts_param[5], VSI_NN_SP_PWLMUL, VSI_NN_SP_PWLMUL, VSI_NN_SP_SR5);
|
||||
status |= vsi_nn_sp_sub(&sp_insts_param[5], VSI_NN_SP_PWLADD, VSI_NN_SP_PWLADD, VSI_NN_SP_SR2);
|
||||
status |= vsi_nn_sp_move(&sp_insts_param[5], VSI_NN_SP_SR1, VSI_NN_SP_SR3);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
||||
attr.flush_cycle_num = 18;
|
||||
|
||||
attr.ignored_leading_outputs = 4;
|
||||
attr.ignored_leading_v11_rd = 0;
|
||||
attr.ignored_leading_v11_wr = 0;
|
||||
attr.ignored_leading_v12_rd = fifo_depth + 3;
|
||||
attr.ignored_leading_v12_wr = 4;
|
||||
|
||||
attr.num_of_v12_rd_in_flush_cycle = 4;
|
||||
attr.num_of_v12_wr_in_flush_cycle = 5;
|
||||
|
||||
attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_XYMERGE;
|
||||
attr.input_setup = VSI_NN_SP_INPUT_SETUP_SINGLE_INPUT;
|
||||
|
||||
attr.prog_init_instr_num = spInitInstsNum;
|
||||
attr.prog_loop_instr_num = spLoopInstsNum;
|
||||
attr.v12_reset_at_start = VSI_NN_SP_V_RESET_AT_START_RESET;
|
||||
|
||||
attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_XY;
|
||||
attr.split_tilex_equal_imgx = TRUE;
|
||||
attr.split_max_vector_depth = max_vector_depth;
|
||||
|
||||
|
||||
|
||||
spinst = vsi_nn_create_spinst(graph);
|
||||
CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final );
|
||||
status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum);
|
||||
status |= vsi_nn_set_spinst_attr(spinst, attr);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
||||
vx_lut_params.lut_function = VX_NN_ACTIVATION_CUSTOM;
|
||||
vx_lut_params.in_lut = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_SP_LUT_MAX_SIZE);
|
||||
vx_lut_params.out_lut = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_SP_LUT_MAX_SIZE);
|
||||
|
||||
sp_lut_params.act_type = VSI_NN_SP_ACT_LINEAR_EXP;
|
||||
sp_lut_params.params[0] = beta;
|
||||
sp_lut_params.params[1] = 0;
|
||||
vsi_nn_sp_lut(vx_lut_params.in_lut, vx_lut_params.out_lut, &sp_lut_params);
|
||||
|
||||
inputs_tensor[0] = input0->t;
|
||||
inputs_tensor[1] = input1->t;
|
||||
outputs_tensor[0] = output0->t;
|
||||
outputs_tensor[1] = output1->t;
|
||||
node = vxStreamProcessorNode(
|
||||
graph->g,
|
||||
inputs_tensor,
|
||||
input_count,
|
||||
outputs_tensor,
|
||||
output_count,
|
||||
spinst->sp,
|
||||
&vx_lut_params);
|
||||
|
||||
final:
|
||||
if (node)
|
||||
{
|
||||
vxAssignNodeQueryCallback(node, softmax_z_direction_exp_query);
|
||||
}
|
||||
|
||||
if (spinst)
|
||||
{
|
||||
vsi_nn_release_spinst(&spinst);
|
||||
}
|
||||
|
||||
if (vx_lut_params.in_lut)
|
||||
{
|
||||
vxReleaseLUT(&vx_lut_params.in_lut);
|
||||
vx_lut_params.in_lut = NULL;
|
||||
}
|
||||
|
||||
if (vx_lut_params.out_lut)
|
||||
{
|
||||
vxReleaseLUT(&vx_lut_params.out_lut);
|
||||
vx_lut_params.out_lut = NULL;
|
||||
}
|
||||
|
||||
return (vsi_nn_kernel_node_t)node;
|
||||
}
|
||||
vsi_nn_kernel_node_t vsi_nn_sp_rcp_node
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t * input,
|
||||
vsi_nn_tensor_t * output,
|
||||
float output_scale
|
||||
)
|
||||
{
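/*
** Reciprocal pass (inferred from the RCP LUT configured below): V12 is fed
** through the piecewise-linear reciprocal table (base + slope * offset
** assembled in r7) and scaled by 1/output_scale back into V12 -- e.g. the
** 1/sum(exp) step of a softmax.
*/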
|
||||
const int32_t spLoopInstsNum = 3;
|
||||
const int32_t spInstsNum = spLoopInstsNum;
|
||||
|
||||
const uint32_t input_count = 1;
|
||||
const uint32_t output_count = 1;
|
||||
vx_tensor inputs_tensor[1] = {NULL};
|
||||
vx_tensor outputs_tensor[1] = {NULL};
|
||||
vx_node node = NULL;
|
||||
int32_t max_vector_depth = graph->ctx->config.sp_vector_depth;
|
||||
|
||||
vsi_nn_spinst_t *spinst = NULL;
|
||||
vsi_nn_spinst_inst_param sp_insts_param[3];
|
||||
vsi_nn_spinst_attr_t attr;
|
||||
|
||||
vsi_nn_sp_lut_params sp_lut_params;
|
||||
vx_lut_params_s vx_lut_params;
|
||||
|
||||
vsi_status status = VSI_FAILURE;
|
||||
|
||||
memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum);
|
||||
vsi_nn_init_spinst_attr(&attr);
|
||||
memset(&sp_lut_params, 0, sizeof(vsi_nn_sp_lut_params));
|
||||
memset(&vx_lut_params, 0, sizeof(vx_lut_params_s));
|
||||
|
||||
/* loop inst0: r1 = pwlSetup(v12) | r5 = pwlMul() | r2 = pwlAdd() | r8 = r1 */
|
||||
status = vsi_nn_sp_pwl_setup0(&sp_insts_param[0], VSI_NN_SP_VR12, VSI_NN_SP_SR1);
|
||||
status |= vsi_nn_sp_mul(&sp_insts_param[0], VSI_NN_SP_PWLMUL, VSI_NN_SP_PWLMUL, VSI_NN_SP_SR5);
|
||||
status |= vsi_nn_sp_sub(&sp_insts_param[0], VSI_NN_SP_PWLADD, VSI_NN_SP_PWLADD, VSI_NN_SP_SR2);
|
||||
status |= vsi_nn_sp_move(&sp_insts_param[0], VSI_NN_SP_SR1, VSI_NN_SP_SR8);
|
||||
/* loop inst1: r6 = r5 * r2 | r7 = r4 + r6 | r4 = r8 */
status |= vsi_nn_sp_mul(&sp_insts_param[1], VSI_NN_SP_SR5, VSI_NN_SP_SR2, VSI_NN_SP_SR6);
status |= vsi_nn_sp_add(&sp_insts_param[1], VSI_NN_SP_SR4, VSI_NN_SP_SR6, VSI_NN_SP_SR7);
status |= vsi_nn_sp_move(&sp_insts_param[1], VSI_NN_SP_SR8, VSI_NN_SP_SR4);
|
||||
/* loop inst2: v12 = r7 * r3 */
|
||||
    status |= vsi_nn_sp_mul(&sp_insts_param[2], VSI_NN_SP_SR7, VSI_NN_SP_SR3, VSI_NN_SP_VR12);
    CHECK_STATUS_FAIL_GOTO(status, final );

    attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_XYMERGE;

    attr.input_setup = VSI_NN_SP_INPUT_SETUP_V12;
    attr.prog_loop_instr_num = spLoopInstsNum;
    attr.ignored_leading_v12_wr = 4;
    attr.ignored_leading_v12_rd = 0;
    attr.flush_cycle_num = 14;

    attr.num_of_v12_wr_in_flush_cycle = 5;

    attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_YZ;
    attr.split_max_vector_depth = max_vector_depth;

    VSI_NN_SP_ATTR_SET_CONST_TO_SR3(attr, 1.0f / output_scale);

    spinst = vsi_nn_create_spinst(graph);
    CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final );
    status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum);
    status |= vsi_nn_set_spinst_attr(spinst, attr);
    CHECK_STATUS_FAIL_GOTO(status, final );

    inputs_tensor[0] = input->t;
    outputs_tensor[0] = output->t;

    vx_lut_params.lut_function = VX_NN_ACTIVATION_CUSTOM;
    vx_lut_params.in_lut = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_SP_LUT_MAX_SIZE);
    vx_lut_params.out_lut = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_SP_LUT_MAX_SIZE);

    sp_lut_params.act_type = VSI_NN_SP_ACT_RCP;
    vsi_nn_sp_lut(vx_lut_params.in_lut, vx_lut_params.out_lut, &sp_lut_params);

    node = vxStreamProcessorNode(
        graph->g,
        inputs_tensor,
        input_count,
        outputs_tensor,
        output_count,
        spinst->sp,
        &vx_lut_params);

final:
    if (spinst)
    {
        vsi_nn_release_spinst(&spinst);
    }

    if (vx_lut_params.in_lut)
    {
        vxReleaseLUT(&vx_lut_params.in_lut);
        vx_lut_params.in_lut = NULL;
    }
    if (vx_lut_params.out_lut)
    {
        vxReleaseLUT(&vx_lut_params.out_lut);
        vx_lut_params.out_lut = NULL;
    }

    return (vsi_nn_kernel_node_t)node;
}

vsi_nn_spinst_t * vsi_nn_sp_times_inst
    (
    vx_context context,
    int32_t fifo_depth,
    int32_t max_vector_depth
    )
{
    vsi_status status = VSI_FAILURE;
    const int32_t spInitInstsNum = 0;
    const int32_t spLoopInstsNum = fifo_depth > 4 ? 1 : fifo_depth > 1 ? 3 : 5;
    const int32_t spInstsNum = spInitInstsNum + spLoopInstsNum;
    vsi_nn_spinst_t *spinst = NULL;
    vsi_nn_spinst_inst_param sp_insts_param[5];
    vsi_nn_spinst_attr_t attr;

    memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum);
    vsi_nn_init_spinst_attr(&attr);

    if (fifo_depth > 4)
    {
        /* loop inst0: out = v12 * in | v12 = v12 */
        status = vsi_nn_sp_mul(&sp_insts_param[0], VSI_NN_SP_VR12, VSI_NN_SP_SRIN, VSI_NN_SP_SROUT);
        status |= vsi_nn_sp_move(&sp_insts_param[0], VSI_NN_SP_VR12, VSI_NN_SP_VR12);
        CHECK_STATUS_FAIL_GOTO(status, final );
    }
    else if (fifo_depth > 1)
    {
        /* loop inst0: out = v12 * in | v12 = v12 */
        status = vsi_nn_sp_mul(&sp_insts_param[0], VSI_NN_SP_VR12, VSI_NN_SP_SRIN, VSI_NN_SP_SROUT);
        status |= vsi_nn_sp_move(&sp_insts_param[0], VSI_NN_SP_VR12, VSI_NN_SP_VR12);
        /* loop inst1: nop */
        status |= vsi_nn_sp_nop(&sp_insts_param[1]);
        /* loop inst2: nop */
        status |= vsi_nn_sp_nop(&sp_insts_param[2]);
        CHECK_STATUS_FAIL_GOTO(status, final );
    }
    else
    {
        /* loop inst0: out = v12 * in | v12 = v12 */
        status = vsi_nn_sp_mul(&sp_insts_param[0], VSI_NN_SP_VR12, VSI_NN_SP_SRIN, VSI_NN_SP_SROUT);
        status |= vsi_nn_sp_move(&sp_insts_param[0], VSI_NN_SP_VR12, VSI_NN_SP_VR12);
        /* loop inst1: nop */
        status |= vsi_nn_sp_nop(&sp_insts_param[1]);
        /* loop inst2: nop */
        status |= vsi_nn_sp_nop(&sp_insts_param[2]);
        /* loop inst3: nop */
        status |= vsi_nn_sp_nop(&sp_insts_param[3]);
        /* loop inst4: nop */
        status |= vsi_nn_sp_nop(&sp_insts_param[4]);
        CHECK_STATUS_FAIL_GOTO(status, final );
    }

    attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_XYMERGE;
    attr.input_setup = VSI_NN_SP_INPUT_SETUP_SINGLE_INPUT;

    attr.prog_init_instr_num = spInitInstsNum;
    attr.prog_loop_instr_num = spLoopInstsNum;

    attr.flush_cycle_num = 0;

    attr.ignored_leading_outputs = 0;
    attr.ignored_leading_v11_rd = 0;
    attr.ignored_leading_v11_wr = 0;

    attr.num_of_v11_rd_in_flush_cycle = 0;
    attr.num_of_v11_wr_in_flush_cycle = 0;

    attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_XY;
    attr.split_tilex_equal_imgx = TRUE;
    attr.split_max_vector_depth = max_vector_depth;

    spinst = vsi_nn_create_spinst_by_context(context);
    CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final );
    status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum);
    status |= vsi_nn_set_spinst_attr(spinst, attr);
    CHECK_STATUS_FAIL_GOTO(status, final );

final:
    return spinst;
}
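
/*
** Note on vsi_nn_sp_times_inst() above: the multiply is always the single
** "out = v12 * in" instruction; for shallow FIFOs the loop is simply padded
** with NOPs (3 instructions when 1 < fifo_depth <= 4, 5 when fifo_depth == 1).
** The instruction counts come from the code itself; the presumed reason for
** the padding (keeping the v12 read/write spacing legal for small FIFO depths)
** is an assumption and is not stated in the source.
*/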

DEF_SP_KERNEL_QUERY(times_query)
    (
    vsi_nn_kernel_node_t node
    )
{
    vsi_status status = VSI_FAILURE;
    vx_size index = 0;
    vx_size tile_size[2] = {0};
    vsi_nn_spinst_t *spinst = NULL;
    int32_t fifo_depth = 0;
    int32_t max_vector_depth = 0;
    vx_context ctx = vxGetContext((vx_reference)node);
    vx_hardware_caps_params_ext2_t hw_param;

    memset(&hw_param, 0, sizeof(vx_hardware_caps_params_ext2_t));
    status = vxQueryHardwareCaps(ctx, (vx_hardware_caps_params_t*)(&hw_param), sizeof(vx_hardware_caps_params_ext2_t));
    CHECK_STATUS_FAIL_GOTO( status, final );

    status = vxQueryNode(node, VX_NODE_SWTILING_TILE_XY, tile_size, sizeof(tile_size));
    CHECK_STATUS_FAIL_GOTO( status, final );
    status = vxQueryNode(node, VX_NODE_SPINST_INDEX, &index, sizeof(index));
    CHECK_STATUS_FAIL_GOTO( status, final );

    fifo_depth = (int32_t)ceil((float)(tile_size[0] * tile_size[1]) / (float)hw_param.streamProcessorExecCount);
    max_vector_depth = hw_param.streamProcessorVectorSize;

    spinst = vsi_nn_sp_times_inst(ctx, fifo_depth, max_vector_depth);

    status = vxSetParameterByIndex( node, (uint32_t)index, (vx_reference)spinst->sp );
    CHECK_STATUS_FAIL_GOTO( status, final );

final:
    if (spinst)
    {
        vsi_nn_release_spinst(&spinst);
    }

    return status;
}

vsi_nn_kernel_node_t vsi_nn_sp_softmax_z_direction_times_node
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t * input0,
    vsi_nn_tensor_t * input1,
    vsi_nn_tensor_t * output
    )
{
    const uint32_t input_count = 2;
    const uint32_t output_count = 1;
    vx_tensor inputs_tensor[2] = {NULL, NULL};
    vx_tensor outputs_tensor[1] = {NULL};
    vx_node node = NULL;
    int32_t max_vector_depth = graph->ctx->config.sp_vector_depth;
    int32_t fifo_depth = 5;

    vsi_nn_spinst_t *spinst = NULL;

    spinst = vsi_nn_sp_times_inst(graph->ctx->c, fifo_depth, max_vector_depth);

    inputs_tensor[0] = input0->t;
    inputs_tensor[1] = input1->t;
    outputs_tensor[0] = output->t;
    node = vxStreamProcessorNode(
        graph->g,
        inputs_tensor,
        input_count,
        outputs_tensor,
        output_count,
        spinst->sp,
        NULL);

    if (node)
    {
        vxAssignNodeQueryCallback(node, times_query);
    }

    if (spinst)
    {
        vsi_nn_release_spinst(&spinst);
    }

    return (vsi_nn_kernel_node_t)node;
}

/*
** This program requires a sum operation in the z dimension.
** Instead of using the SUM Engine, the sum needs to be performed
** by Stream Processor instructions.
*/
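
/*
** Sketch of the decomposition used by softmax_z_direction() below (a summary
** of the node calls that follow; the per-node math is inferred from the node
** names and arguments, so treat it as an assumption, not a specification):
**   1. vsi_nn_sp_max_axis2_node                 : m(x,y)   = max over z of in
**   2. vsi_nn_sp_softmax_z_direction_exp_node   : e(x,y,z) = exp(beta * (in - m)),
**                                                 s(x,y)   = sum over z of e
**   3. vsi_nn_sp_rcp_node                       : r(x,y)   = 1 / (s * output_scale)
**   4. vsi_nn_sp_softmax_z_direction_times_node : out      = e * r
*/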
vsi_nn_kernel_node_t softmax_z_direction
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t ** inputs,
    vsi_nn_tensor_t ** outputs,
    const vsi_nn_kernel_param_t * params
    )
{
    vsi_nn_kernel_node_t node = NULL;
    vsi_nn_tensor_attr_t attr;
    vsi_nn_tensor_t * dummy_tensor[3] = {NULL};
    vsi_nn_tensor_t * output_tensor[2] = {NULL};
    int32_t axis = 2;
    float beta = vsi_nn_kernel_param_get_float32( params, "beta" );
    float output_scale = vsi_nn_get_tensor_scale(outputs[0]);

    memcpy( &attr, &outputs[0]->attr, sizeof(vsi_nn_tensor_attr_t) );
    attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
    attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
    attr.is_const = FALSE;
    attr.vtl = TRUE;
    attr.is_dummy = TRUE;
    attr.size[axis] = 1;
    dummy_tensor[0] = vsi_nn_CreateTensor( graph, &attr );
    CHECK_PTR_FAIL_GOTO( dummy_tensor[0], "Create dummy_tensor fail.", final );
    dummy_tensor[1] = vsi_nn_CreateTensor( graph, &attr );
    CHECK_PTR_FAIL_GOTO( dummy_tensor[1], "Create dummy_tensor fail.", final );
    dummy_tensor[2] = vsi_nn_CreateTensor( graph, &attr );
    CHECK_PTR_FAIL_GOTO( dummy_tensor[2], "Create dummy_tensor fail.", final );

    memcpy( &attr, &outputs[0]->attr, sizeof(vsi_nn_tensor_attr_t) );
    attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
    attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
    attr.is_const = FALSE;
    attr.vtl = TRUE;
    output_tensor[0] = vsi_nn_CreateTensor( graph, &attr );
    CHECK_PTR_FAIL_GOTO( output_tensor[0], "Create tensor fail.", final );
    output_tensor[1] = vsi_nn_CreateTensor( graph, &attr );
    CHECK_PTR_FAIL_GOTO( output_tensor[1], "Create tensor fail.", final );

    node = vsi_nn_sp_max_axis2_node(graph, inputs[0], output_tensor[0], dummy_tensor[0]);
    CHECK_PTR_FAIL_GOTO( node, "Create sp_max_axis2 fail.", final );
    node = vsi_nn_sp_softmax_z_direction_exp_node(graph, output_tensor[0], dummy_tensor[0],
        output_tensor[1], dummy_tensor[1], beta);
    CHECK_PTR_FAIL_GOTO( node, "Create exp_y_direction fail.", final );
    node = vsi_nn_sp_rcp_node(graph, dummy_tensor[1], dummy_tensor[2], output_scale);
    CHECK_PTR_FAIL_GOTO( node, "Create sp_rcp fail.", final );
    node = vsi_nn_sp_softmax_z_direction_times_node(graph, output_tensor[1], dummy_tensor[2], outputs[0]);
    CHECK_PTR_FAIL_GOTO( node, "Create softmax_times fail.", final );

final:
    vsi_safe_release_tensor(dummy_tensor[0]);
    vsi_safe_release_tensor(dummy_tensor[1]);
    vsi_safe_release_tensor(dummy_tensor[2]);
    vsi_safe_release_tensor(output_tensor[0]);
    vsi_safe_release_tensor(output_tensor[1]);

    return node;
} /* softmax_z_direction() */

#endif

@@ -35,6 +35,7 @@
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_math.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_dtype_util.h"

#include "libnnext/vsi_nn_libnnext_resource.h"
#if VSI_USE_VXC_BINARY

@@ -118,7 +119,14 @@ static void _kernel_clear_source

static vsi_bool _check_shader_support(vsi_nn_graph_t* graph);

static vsi_bool vsi_nn_kernel_is_asymmtric_int8
static vsi_bool _check_stream_process_support
    (
    vsi_nn_graph_t* graph,
    vsi_nn_tensor_t** inputs,
    size_t input_num
    );

vsi_bool vsi_nn_kernel_is_supported_types
    (
    vsi_nn_tensor_t** inputs,
    size_t input_num,

@@ -1222,7 +1230,7 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector
        /* Skip evis and cl when disable shader */
        if ( (type == VSI_NN_KERNEL_TYPE_EVIS || type == VSI_NN_KERNEL_TYPE_CL)
            && ( _check_shader_support(graph) == FALSE ||
            vsi_nn_kernel_is_asymmtric_int8(inputs, input_num, outputs, output_num) ) )
            vsi_nn_kernel_is_supported_types(inputs, input_num, outputs, output_num) == FALSE ) )
        {
            continue;
        }

@@ -1234,8 +1242,8 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector
        }

        /* Skip StreamProcessor if not supported */
        if( type == VSI_NN_KERNEL_TYPE_SP
            && !graph->ctx->config.support_stream_processor )
        if( type == VSI_NN_KERNEL_TYPE_SP &&
            _check_stream_process_support(graph, inputs, input_num) == FALSE )
        {
            continue;
        }

@@ -1661,7 +1669,7 @@ static vsi_bool _check_shader_support(vsi_nn_graph_t* graph)
    return FALSE;
}

static vsi_bool vsi_nn_kernel_is_asymmtric_int8
vsi_bool vsi_nn_kernel_is_supported_types
    (
    vsi_nn_tensor_t** inputs,
    size_t input_num,

@@ -1673,25 +1681,45 @@ static vsi_bool vsi_nn_kernel_is_asymmtric_int8

    for (i = 0; i < input_num; i++)
    {
        if ( inputs[i] &&
            inputs[i]->attr.dtype.vx_type == VSI_NN_TYPE_INT8 &&
            inputs[i]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC
            )
        if ( inputs[i] && vsi_nn_TypeGetBits(inputs[i]->attr.dtype.vx_type) == 4 )
        {
            return TRUE;
            return FALSE;
        }
    }

    for (i = 0; i < output_num; i++)
    {
        if ( outputs[i] &&
            outputs[i]->attr.dtype.vx_type == VSI_NN_TYPE_INT8 &&
            outputs[i]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC
            )
        if ( outputs[i] && vsi_nn_TypeGetBits(outputs[i]->attr.dtype.vx_type) == 4 )
        {
            return TRUE;
            return FALSE;
        }
    }

    return FALSE;
    return TRUE;
}
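
/*
** With this change vsi_nn_kernel_is_supported_types() reports the type set as
** unsupported for shader (EVIS/CL) kernels as soon as any input or output is a
** 4-bit type, and it no longer special-cases asymmetric INT8; the return value
** now means "supported" when TRUE. The reason 4-bit tensors are excluded is
** not stated in this change.
*/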

static vsi_bool _check_stream_process_support
    (
    vsi_nn_graph_t* graph,
    vsi_nn_tensor_t** inputs,
    size_t input_num
    )
{
    if ( graph->ctx->config.support_stream_processor == 0 )
    {
        return FALSE;
    }

    if ( graph->ctx->config.sp_exec_count == 0 )
    {
        return FALSE;
    }

    if (inputs && input_num > 0 &&
        inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32)
    {
        return FALSE;
    }

    return TRUE;
}

@@ -653,4 +653,61 @@ vsi_bool vsi_nn_kernel_optimize_group_norm_shape
    }

    return status;
}
}

vsi_bool vsi_nn_kernel_optimize_scatter_elements_shape
    (
    const vsi_size_t* shape_x, const vsi_size_t rank_x, const int32_t axis,
    vsi_size_t* out_shape_x, uint32_t* out_rank_x, int32_t* out_axis, vsi_size_t max_size
    )
{
    vsi_bool ret = TRUE;
    vsi_size_t i = 0;
    vsi_size_t rank_in = 0;
    vsi_size_t dims = 0;
    vsi_size_t innerSize = 1;
    vsi_size_t outerSize = 1;
    vsi_size_t axisSize = shape_x[axis];

    for (i = 0; i < (size_t)axis; i++)
    {
        innerSize *= shape_x[i];
    }

    for (i = axis + 1; i < rank_x; i++)
    {
        outerSize *= shape_x[i];
    }

    rank_in += element_fill_dim(out_shape_x, rank_in, max_size, innerSize);
    dims = element_fill_dim(out_shape_x, rank_in, max_size, axisSize);
    if (dims == 0)
    {
        *out_axis = (int32_t)rank_in;
        out_shape_x[rank_in ++] = 1;
    }
    else
    {
        *out_axis = (int32_t)rank_in;
    }

    rank_in += dims;

    rank_in += element_fill_dim(out_shape_x, rank_in, max_size, outerSize);

    if ( 0 == rank_in )
    {
        out_shape_x[0] = 1;
        out_shape_x[1] = 1;
        rank_in = 2;
    }
    else if ( 1 == rank_in )
    {
        out_shape_x[1] = 1;
        rank_in = 2;
    }

    *out_rank_x = (uint32_t)rank_in;

    return ret;
} /* vsi_nn_kernel_optimize_scatter_elements_shape() */
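
/*
** Illustrative example (not taken from the source): collapsing
** shape_x = {4, 3, 5, 2} with axis = 2 and a max_size large enough that no
** dimension needs splitting gives
**   innerSize = 4 * 3 = 12, axisSize = 5, outerSize = 2
**   => out_shape_x = {12, 5, 2}, *out_axis = 1, *out_rank_x = 3
** This assumes element_fill_dim() appends one dimension per call when the size
** fits within max_size and returns the number of dimensions written (0 for a
** size of 1).
*/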

@@ -199,6 +199,31 @@ static float softsign_eval(float x)
    return x / (1 + vsi_abs(x));
}

static float linear_exp_eval(float x, vsi_nn_kernel_lut_params *lut_param)
{
    float a = lut_param->params[0];
    float b = lut_param->params[1];

    return expf(x * a + b);
}

static float linear_rsqrt_eval(float x, vsi_nn_kernel_lut_params *lut_param)
{
    float a = lut_param->params[0];
    float b = lut_param->params[1];
    float scale = lut_param->params[2];

    return scale / sqrtf(a * x + b);
}

static float linear_sigmoid_eval(float x, vsi_nn_kernel_lut_params *lut_param)
{
    float a = lut_param->params[0];
    float b = lut_param->params[1];

    return 1.0f / (1 + expf(a * x + b));
}
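
/*
** The three linear_* helpers above evaluate a base function of the linear
** transform a*x + b taken from lut_param->params: exp(a*x + b),
** scale / sqrt(a*x + b), and the sigmoid-style 1 / (1 + exp(a*x + b)). They
** back the VSI_NN_KERNEL_LUT_LINEAR_EXP/RSQRT/SIGMOID enum values added in
** this commit and are dispatched in the switch excerpt below.
*/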

static float vsi_nn_kernel_lut_activation(float data, vsi_nn_kernel_lut_params *lut_param)
{
    float result = 0;

@@ -261,6 +286,15 @@ static float vsi_nn_kernel_lut_activation(float data, vsi_nn_kernel_lut_params *
        case VSI_NN_KERNEL_LUT_SOFTSIGN:
            result = softsign_eval(data);
            break;
        case VSI_NN_KERNEL_LUT_LINEAR_EXP:
            result = linear_exp_eval(data, lut_param);
            break;
        case VSI_NN_KERNEL_LUT_LINEAR_RSQRT:
            result = linear_rsqrt_eval(data, lut_param);
            break;
        case VSI_NN_KERNEL_LUT_LINEAR_SIGMOID:
            result = linear_sigmoid_eval(data, lut_param);
            break;
        default:
            VSILOGE( "unsupported activation function:%d", lut_param->act_type );
            break;

@@ -43,7 +43,8 @@ static vsi_bool _build_vx_conv2d_param
    int32_t dilation_h, int32_t dilation_w,
    int32_t multiplier,
    vsi_enum overflow_policy, vsi_enum rounding_policy,
    vsi_enum down_scale_size_rounding
    vsi_enum down_scale_size_rounding,
    vsi_enum pad_mode
    )
{
    vx_nn_convolution_params_ext_t * p1 = NULL;

@@ -78,6 +79,7 @@ static vsi_bool _build_vx_conv2d_param
    p1->khr.down_scale_size_rounding = (vx_enum)down_scale_size_rounding;
    p1->padding_x_right = (uint32_t)pad_w_end;
    p1->padding_y_bottom = (uint32_t)pad_h_end;
    p1->pad_mode = (vx_enum)pad_mode;
    param->depth_multiplier = multiplier;
    param->stride_x = (uint32_t)stride_w;
    param->stride_y = (uint32_t)stride_h;

@@ -131,7 +133,8 @@ static vsi_bool _build_vx_conv3d_param
    int32_t dilation_d, int32_t dilation_h, int32_t dilation_w,
    int32_t multiplier,
    vsi_enum overflow_policy, vsi_enum rounding_policy,
    vsi_enum down_scale_size_rounding
    vsi_enum down_scale_size_rounding,
    vsi_enum pad_mode
    )
{
    VSI_ASSERT( stride_d > 0 );

@@ -176,6 +179,7 @@ static vsi_bool _build_vx_conv3d_param
    param->stride_w = (uint32_t)stride_w;
    param->stride_h = (uint32_t)stride_h;
    param->stride_d = (uint32_t)stride_d;
    param->pad_mode = (vx_enum)pad_mode;

    return TRUE;
} /* _build_vx_conv2d_param() */

@@ -299,7 +303,8 @@ REGISTER_CONV_OPENVX_KERNEL( conv1d )
        0,
        vsi_nn_kernel_param_get_int32(params, "overflow_policy"),
        vsi_nn_kernel_param_get_int32(params, "rounding_policy"),
        vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding")
        vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding"),
        vsi_nn_kernel_param_get_int32(params, "pad_mode")
        );

    temp_tensors[0] = _expand_tensor_dim( inputs[0]->t,

@@ -374,7 +379,8 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d )
        vsi_nn_kernel_param_get_int32(params, "multiplier"),
        vsi_nn_kernel_param_get_int32(params, "overflow_policy"),
        vsi_nn_kernel_param_get_int32(params, "rounding_policy"),
        vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding")
        vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding"),
        vsi_nn_kernel_param_get_int32(params, "pad_mode")
        );

    temp_tensors[0] = _expand_tensor_dim( inputs[0]->t,

@@ -493,7 +499,8 @@ REGISTER_CONV_OPENVX_KERNEL( conv2d )
        0,
        vsi_nn_kernel_param_get_int32(params, "overflow_policy"),
        vsi_nn_kernel_param_get_int32(params, "rounding_policy"),
        vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding")
        vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding"),
        vsi_nn_kernel_param_get_int32(params, "pad_mode")
        );

    node = vxConvolutionLayer( graph->g,

@@ -524,7 +531,8 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv2d )
        vsi_nn_kernel_param_get_int32(params, "multiplier"),
        vsi_nn_kernel_param_get_int32(params, "overflow_policy"),
        vsi_nn_kernel_param_get_int32(params, "rounding_policy"),
        vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding")
        vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding"),
        vsi_nn_kernel_param_get_int32(params, "pad_mode")
        );

    node = vxConvolutionLayer( graph->g,

@@ -606,7 +614,8 @@ REGISTER_CONV_OPENVX_KERNEL( conv3d )
        vsi_nn_kernel_param_get_int32(params, "depth_multiplier"),
        vsi_nn_kernel_param_get_int32(params, "overflow_policy"),
        vsi_nn_kernel_param_get_int32(params, "rounding_policy"),
        vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding")
        vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding"),
        vsi_nn_kernel_param_get_int32(params, "pad_mode")
        );

    node = vxConv3dLayer( graph->g,

@@ -269,4 +269,84 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( tanh )
    return (vsi_nn_kernel_node_t)node;
} /* tanh() */

REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( relu1 )
{
    vx_node node = NULL;

    node = vxActivationLayer(
        graph->g,
        inputs[0]->t,
        VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RELU1,
        0,
        0,
        outputs[0]->t
        );

    return (vsi_nn_kernel_node_t)node;
} /* relu1() */

REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( relu6 )
{
    vx_node node = NULL;

    node = vxActivationLayer(
        graph->g,
        inputs[0]->t,
        VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RELU6,
        0,
        0,
        outputs[0]->t
        );

    return (vsi_nn_kernel_node_t)node;
} /* relu6() */

REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( rsqrt )
{
    vx_node node = NULL;

    node = vxActivationLayer(
        graph->g,
        inputs[0]->t,
        VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RSQRT,
        0,
        0,
        outputs[0]->t
        );

    return (vsi_nn_kernel_node_t)node;
} /* rsqrt() */

REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( sqrt )
{
    vx_node node = NULL;

    node = vxActivationLayer(
        graph->g,
        inputs[0]->t,
        VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SQRT,
        0,
        0,
        outputs[0]->t
        );

    return (vsi_nn_kernel_node_t)node;
} /* sqrt() */

REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( softrelu )
{
    vx_node node = NULL;

    node = vxActivationLayer(
        graph->g,
        inputs[0]->t,
        VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SOFTRELU,
        0,
        0,
        outputs[0]->t
        );

    return (vsi_nn_kernel_node_t)node;
} /* softrelu() */

#undef REGISTER_ELTWISE_UNARY_OPENVX_KERNEL

@@ -65,6 +65,7 @@ REGISTER_PAD2_OPENVX_KERNEL( pad2 )
    int32_t pad_front_array[VSI_NN_MAX_DIM_NUM] = {0};
    int32_t pad_back_array[VSI_NN_MAX_DIM_NUM] = {0};
    vsi_nn_tensor_t *convert_tensor = NULL;
    vsi_bool release_intermediate_tensor = TRUE;
    float const_val = vsi_nn_kernel_param_get_float32(params, "const_val");

    memset(&param, 0, sizeof(param));

@@ -98,14 +99,18 @@ REGISTER_PAD2_OPENVX_KERNEL( pad2 )
    }
    else
    {
        convert_tensor = vsi_nn_reshape_tensor( graph,
            inputs[0], inputs[0]->attr.size, inputs[0]->attr.dim_num );
        convert_tensor = inputs[0];
        release_intermediate_tensor = FALSE;
    }

    node = vxTensorPadNode( graph->g, convert_tensor->t, outputs[0]->t, &param, sizeof(param) );

    vxReleaseScalar( &param.pad_const );
    vsi_safe_release_tensor(convert_tensor);

    if (release_intermediate_tensor)
    {
        vsi_safe_release_tensor(convert_tensor);
    }

    return (vsi_nn_kernel_node_t)node;
} /* pad2() */
@ -0,0 +1,281 @@
|
|||
#pragma OPENCL EXTENSION CL_VIV_asm : enable
|
||||
|
||||
#define BUCKETIZE_F32_2D_SH_IMPL(name, comp_op) \
|
||||
__kernel void bucketize_##name \
|
||||
( \
|
||||
__read_only image2d_t input, \
|
||||
__read_only image2d_t boundaries, \
|
||||
__write_only image2d_t output, \
|
||||
int boundaries_size, \
|
||||
float input0_scale, \
|
||||
float input0_tail, \
|
||||
float input1_scale, \
|
||||
float input1_tail \
|
||||
) \
|
||||
{ \
|
||||
int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
|
||||
\
|
||||
float4 src0 = read_imagef(input, coord); \
|
||||
\
|
||||
int2 pos = 0; \
|
||||
do \
|
||||
{ \
|
||||
float4 src1 = read_imagef(boundaries, pos); \
|
||||
if ((src0.x) comp_op (src1.x)) \
|
||||
{ \
|
||||
break; \
|
||||
} \
|
||||
pos.x ++; \
|
||||
} while(pos.x < boundaries_size); \
|
||||
\
|
||||
write_imagei(output, coord, pos.xxxx); \
|
||||
}
|
||||
BUCKETIZE_F32_2D_SH_IMPL(F32_F32toI32_2D, <=)
|
||||
BUCKETIZE_F32_2D_SH_IMPL(right_F32_F32toI32_2D, <)
|
||||
|
||||
#define BUCKETIZE_F32_SH_IMPL(name, comp_op) \
|
||||
__kernel void bucketize_##name \
|
||||
( \
|
||||
__read_only image2d_array_t input, \
|
||||
__read_only image2d_t boundaries, \
|
||||
__write_only image2d_array_t output, \
|
||||
int boundaries_size, \
|
||||
float input0_scale, \
|
||||
float input0_tail, \
|
||||
float input1_scale, \
|
||||
float input1_tail \
|
||||
) \
|
||||
{ \
|
||||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
|
||||
\
|
||||
float4 src0 = read_imagef(input, coord); \
|
||||
\
|
||||
int2 pos = 0; \
|
||||
do \
|
||||
{ \
|
||||
float4 src1 = read_imagef(boundaries, pos); \
|
||||
if ((src0.x) comp_op (src1.x)) \
|
||||
{ \
|
||||
break; \
|
||||
} \
|
||||
pos.x ++; \
|
||||
} while(pos.x < boundaries_size); \
|
||||
\
|
||||
write_imagei(output, coord, pos.xxxx); \
|
||||
}
|
||||
BUCKETIZE_F32_SH_IMPL(F32_F32toI32, <=)
|
||||
BUCKETIZE_F32_SH_IMPL(right_F32_F32toI32, <)
|
||||
|
||||
#define BUCKETIZE_I32_2D_SH_IMPL(name, comp_op) \
|
||||
__kernel void bucketize_##name \
|
||||
( \
|
||||
__read_only image2d_t input, \
|
||||
__read_only image2d_t boundaries, \
|
||||
__write_only image2d_t output, \
|
||||
int boundaries_size, \
|
||||
float input0_scale, \
|
||||
float input0_tail, \
|
||||
float input1_scale, \
|
||||
float input1_tail \
|
||||
) \
|
||||
{ \
|
||||
int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
|
||||
\
|
||||
float4 src0 = convert_float4(read_imagei(input, coord)); \
|
||||
\
|
||||
int2 pos = 0; \
|
||||
src0 = src0 * input0_scale + input0_tail; \
|
||||
do \
|
||||
{ \
|
||||
float4 src1 = convert_float4(read_imagei(boundaries, pos)); \
|
||||
src1 = src1 * input1_scale + input1_tail; \
|
||||
if ((src0.x) comp_op (src1.x)) \
|
||||
{ \
|
||||
break; \
|
||||
} \
|
||||
pos.x ++; \
|
||||
} while(pos.x < boundaries_size); \
|
||||
\
|
||||
write_imagei(output, coord, pos.xxxx); \
|
||||
}
|
||||
BUCKETIZE_I32_2D_SH_IMPL(I32_I32toI32_2D, <=)
|
||||
BUCKETIZE_I32_2D_SH_IMPL(right_I32_I32toI32_2D, <)
|
||||
|
||||
#define BUCKETIZE_I32_SH_IMPL(name, comp_op) \
|
||||
__kernel void bucketize_##name \
|
||||
( \
|
||||
__read_only image2d_array_t input, \
|
||||
__read_only image2d_t boundaries, \
|
||||
__write_only image2d_array_t output, \
|
||||
int boundaries_size, \
|
||||
float input0_scale, \
|
||||
float input0_tail, \
|
||||
float input1_scale, \
|
||||
float input1_tail \
|
||||
) \
|
||||
{ \
|
||||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
|
||||
\
|
||||
int4 data = read_imagei(input, coord); \
|
||||
float4 src0 = convert_float4(data) * input0_scale + input0_tail; \
|
||||
\
|
||||
int2 pos = 0; \
|
||||
do \
|
||||
{ \
|
||||
float4 src1 = convert_float4(read_imagei(boundaries, pos)); \
|
||||
src1 = src1 * input1_scale + input1_tail; \
|
||||
if ((src0.x) comp_op (src1.x)) \
|
||||
{ \
|
||||
break; \
|
||||
} \
|
||||
pos.x ++; \
|
||||
} while(pos.x < boundaries_size); \
|
||||
\
|
||||
write_imagei(output, coord, pos.xxxx); \
|
||||
}
|
||||
BUCKETIZE_I32_SH_IMPL(I32_I32toI32, <=)
|
||||
BUCKETIZE_I32_SH_IMPL(right_I32_I32toI32, <)
|
||||
|
||||
#define BUCKETIZE_U32_2D_SH_IMPL(name, comp_op) \
|
||||
__kernel void bucketize_##name \
|
||||
( \
|
||||
__read_only image2d_t input, \
|
||||
__read_only image2d_t boundaries, \
|
||||
__write_only image2d_t output, \
|
||||
int boundaries_size, \
|
||||
float input0_scale, \
|
||||
float input0_tail, \
|
||||
float input1_scale, \
|
||||
float input1_tail \
|
||||
) \
|
||||
{ \
|
||||
int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
|
||||
\
|
||||
float4 src0 = convert_float4(read_imageui(input, coord)); \
|
||||
\
|
||||
int2 pos = 0; \
|
||||
src0 = src0 * input0_scale + input0_tail; \
|
||||
do \
|
||||
{ \
|
||||
float4 src1 = convert_float4(read_imageui(boundaries, pos)); \
|
||||
src1 = src1 * input1_scale + input1_tail; \
|
||||
if ((src0.x) comp_op (src1.x)) \
|
||||
{ \
|
||||
break; \
|
||||
} \
|
||||
pos.x ++; \
|
||||
} while(pos.x < boundaries_size); \
|
||||
\
|
||||
write_imagei(output, coord, pos.xxxx); \
|
||||
}
|
||||
BUCKETIZE_U32_2D_SH_IMPL(U32_U32toI32_2D, <=)
|
||||
BUCKETIZE_U32_2D_SH_IMPL(right_U32_U32toI32_2D, <)
|
||||
|
||||
#define BUCKETIZE_U32_SH_IMPL(name, comp_op) \
|
||||
__kernel void bucketize_##name \
|
||||
( \
|
||||
__read_only image2d_array_t input, \
|
||||
__read_only image2d_t boundaries, \
|
||||
__write_only image2d_array_t output, \
|
||||
int boundaries_size, \
|
||||
float input0_scale, \
|
||||
float input0_tail, \
|
||||
float input1_scale, \
|
||||
float input1_tail \
|
||||
) \
|
||||
{ \
|
||||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
|
||||
\
|
||||
uint4 data = read_imageui(input, coord); \
|
||||
float4 src0 = convert_float4(data) * input0_scale + input0_tail; \
|
||||
\
|
||||
int2 pos = 0; \
|
||||
do \
|
||||
{ \
|
||||
float4 src1 = convert_float4(read_imageui(boundaries, pos)); \
|
||||
src1 = src1 * input1_scale + input1_tail; \
|
||||
if ((src0.x) comp_op (src1.x)) \
|
||||
{ \
|
||||
break; \
|
||||
} \
|
||||
pos.x ++; \
|
||||
} while(pos.x < boundaries_size); \
|
||||
\
|
||||
write_imagei(output, coord, pos.xxxx); \
|
||||
}
|
||||
BUCKETIZE_U32_SH_IMPL(U32_U32toI32, <=)
|
||||
BUCKETIZE_U32_SH_IMPL(right_U32_U32toI32, <)
|
||||
|
||||
#define BUCKETIZE_BF16_2D_SH_IMPL(name, comp_op) \
|
||||
__kernel void bucketize_##name \
|
||||
( \
|
||||
__read_only image2d_t input, \
|
||||
__read_only image2d_t boundaries, \
|
||||
__write_only image2d_t output, \
|
||||
int boundaries_size, \
|
||||
float input0_scale, \
|
||||
float input0_tail, \
|
||||
float input1_scale, \
|
||||
float input1_tail \
|
||||
) \
|
||||
{ \
|
||||
int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
|
||||
\
|
||||
uint4 data0 = read_imageui(input, coord) << 16; \
|
||||
float4 src0; \
|
||||
_viv_asm(COPY, src0, data0, 16); \
|
||||
\
|
||||
int2 pos = 0; \
|
||||
do \
|
||||
{ \
|
||||
uint4 data1 = read_imageui(boundaries, pos) << 16; \
|
||||
float4 src1; \
|
||||
_viv_asm(COPY, src1, data1, 16); \
|
||||
if ((src0.x) comp_op (src1.x)) \
|
||||
{ \
|
||||
break; \
|
||||
} \
|
||||
pos.x ++; \
|
||||
} while(pos.x < boundaries_size); \
|
||||
\
|
||||
write_imagei(output, coord, pos.xxxx); \
|
||||
}
|
||||
BUCKETIZE_BF16_2D_SH_IMPL(BF16_BF16toI32_2D, <=)
|
||||
BUCKETIZE_BF16_2D_SH_IMPL(right_BF16_BF16toI32_2D, <)
|
||||
|
||||
#define BUCKETIZE_BF16_SH_IMPL(name, comp_op) \
|
||||
__kernel void bucketize_##name \
|
||||
( \
|
||||
__read_only image2d_array_t input, \
|
||||
__read_only image2d_t boundaries, \
|
||||
__write_only image2d_array_t output, \
|
||||
int boundaries_size, \
|
||||
float input0_scale, \
|
||||
float input0_tail, \
|
||||
float input1_scale, \
|
||||
float input1_tail \
|
||||
) \
|
||||
{ \
|
||||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
|
||||
\
|
||||
uint4 data0 = read_imageui(input, coord) << 16; \
|
||||
float4 src0; \
|
||||
_viv_asm(COPY, src0, data0, 16); \
|
||||
\
|
||||
int2 pos = 0; \
|
||||
do \
|
||||
{ \
|
||||
uint4 data1 = read_imageui(boundaries, pos) << 16; \
|
||||
float4 src1; \
|
||||
_viv_asm(COPY, src1, data1, 16); \
|
||||
if ((src0.x) comp_op (src1.x)) \
|
||||
{ \
|
||||
break; \
|
||||
} \
|
||||
pos.x ++; \
|
||||
} while(pos.x < boundaries_size); \
|
||||
\
|
||||
write_imagei(output, coord, pos.xxxx); \
|
||||
}
|
||||
BUCKETIZE_BF16_SH_IMPL(BF16_BF16toI32, <=)
|
||||
BUCKETIZE_BF16_SH_IMPL(right_BF16_BF16toI32, <)
|
||||
|
|
@ -0,0 +1,115 @@
|
|||
|
||||
#define LPPOOL_PROCESS(src_type, dst_type, readimage_type, conv_mode, writeimage_type) \
|
||||
int gidx = get_global_id(0); \
|
||||
int gidy = get_global_id(1); \
|
||||
int hstart = gidy * stride_y - pad_top; \
|
||||
int wstart = gidx * stride_x - pad_left; \
|
||||
int hend = min(hstart + ksize_y, height); \
|
||||
int wend = min(wstart + ksize_x, width); \
|
||||
int4 coord_out = (int4)(gidx, gidy, get_global_id(2), 0); \
|
||||
int4 coord_in = coord_out; \
|
||||
int h, w; \
|
||||
float sum_of_pow = 0; \
|
||||
dst_type out_data = (dst_type)(0); \
|
||||
src_type in_data; \
|
||||
float in_f32, out_f32; \
|
||||
hstart = max(hstart, 0); \
|
||||
wstart = max(wstart, 0); \
|
||||
for (h = hstart; h < hend; h++) \
|
||||
{ \
|
||||
for (w = wstart; w < wend; w++) \
|
||||
{ \
|
||||
coord_in.xy = (int2)(w, h); \
|
||||
in_data = readimage_type(input, coord_in).x; \
|
||||
in_f32 = convert_float(in_data) * inputScale + inputTail; \
|
||||
sum_of_pow += pow(fabs(in_f32),p); \
|
||||
} \
|
||||
} \
|
||||
out_f32 = pow(sum_of_pow, 1.0f / p) * outputScale + outputTail; \
|
||||
out_data.x = conv_mode(out_f32); \
|
||||
writeimage_type(output, coord_out, out_data); \
|
||||
|
||||
#define TENSOR_LPPOOL(src_name, dst_name, src_type, dst_type, readimage_type, conv_mode, writeimage_type) \
|
||||
__kernel void lppool_##src_name##to##dst_name ( \
|
||||
__read_only image2d_array_t input, \
|
||||
__write_only image2d_array_t output, \
|
||||
int ksize_x, \
|
||||
int ksize_y, \
|
||||
int stride_x, \
|
||||
int stride_y, \
|
||||
int pad_left, \
|
||||
int pad_top, \
|
||||
int p, \
|
||||
int width, \
|
||||
int height, \
|
||||
float inputScale, \
|
||||
float inputTail, \
|
||||
float outputScale, \
|
||||
float outputTail) \
|
||||
{ \
|
||||
LPPOOL_PROCESS(src_type, dst_type, readimage_type, conv_mode, writeimage_type); \
|
||||
}
|
||||
|
||||
TENSOR_LPPOOL(F32, F32, float, float4, read_imagef, convert_float, write_imagef)
|
||||
TENSOR_LPPOOL(F32, U32, float, uint4, read_imagef, convert_uint, write_imageui)
|
||||
TENSOR_LPPOOL(F32, I32, float, int4, read_imagef, convert_int, write_imagei)
|
||||
|
||||
TENSOR_LPPOOL(U32, U32, uint, uint4, read_imageui, convert_uint, write_imageui)
|
||||
TENSOR_LPPOOL(U32, F32, uint, float4, read_imageui, convert_float, write_imagef)
|
||||
TENSOR_LPPOOL(U32, I32, uint, int4, read_imageui, convert_int, write_imagei)
|
||||
|
||||
TENSOR_LPPOOL(I32, I32, int, int4, read_imagei, convert_int, write_imagei)
|
||||
TENSOR_LPPOOL(I32, F32, int, float4, read_imagei, convert_float, write_imagef)
|
||||
TENSOR_LPPOOL(I32, U32, int, uint4, read_imagei, convert_uint, write_imageui)
|
||||
|
||||
__kernel void lppool_BF16toBF16(
|
||||
__read_only image2d_array_t input,
|
||||
__write_only image2d_array_t output,
|
||||
int ksize_x,
|
||||
int ksize_y,
|
||||
int stride_x,
|
||||
int stride_y,
|
||||
int pad_left,
|
||||
int pad_top,
|
||||
int p,
|
||||
int width,
|
||||
int height,
|
||||
float inputScale,
|
||||
float inputTail,
|
||||
float outputScale,
|
||||
float outputTail)
|
||||
{
|
||||
int gidx = get_global_id(0);
|
||||
int gidy = get_global_id(1);
|
||||
int hstart = gidy * stride_y - pad_top;
|
||||
int wstart = gidx * stride_x - pad_left;
|
||||
int hend = min(hstart + ksize_y, height);
|
||||
int wend = min(wstart + ksize_x, width);
|
||||
int4 coord_out = (int4)(gidx, gidy, get_global_id(2), 0);
|
||||
int4 coord_in = coord_out;
|
||||
int h, w;
|
||||
float sum_of_pow = 0;
|
||||
float out_data_f32 = 0;
|
||||
uint4 dst = (uint4)(0);
|
||||
float4 data_f32 = (float4)(0);
|
||||
uint4 data;
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
|
||||
for (h = hstart; h < hend; h++)
|
||||
{
|
||||
for (w = wstart; w < wend; w++)
|
||||
{
|
||||
coord_in.xy = (int2)(w, h);
|
||||
data = read_imageui(input, coord_in);
|
||||
data = data << 16;
|
||||
_viv_asm(COPY, data_f32, data, 16);
|
||||
sum_of_pow += pow(abs(data_f32.x),p);
|
||||
}
|
||||
}
|
||||
out_data_f32 = pow(sum_of_pow, 1.0f / p);
|
||||
_viv_asm(COPY, dst, out_data_f32, 4);
|
||||
dst.x = dst.x >> 16;
|
||||
write_imageui(output, coord_out, dst);
|
||||
}
|
||||
|
||||
|
|

@@ -124,7 +124,7 @@ __kernel void maximum_I32I32toI32
    float4 data0 = convert_float4(src0) * input0Scale - input0Tail;
    float4 data1 = convert_float4(src1) * input1Scale - input1Tail;
    float4 data = data0 > data1 ? data0 : data1;
    int4 dst = convert_int4(data * outputScale + outputZP);
    int4 dst = convert_int4_rte(data * outputScale + outputZP);

    write_imagei(output, coord, dst);
}

@@ -150,7 +150,7 @@ __kernel void maximum_I32I32toI32_2D
    float4 data0 = convert_float4(src0) * input0Scale - input0Tail;
    float4 data1 = convert_float4(src1) * input1Scale - input1Tail;
    float4 data = data0 > data1 ? data0 : data1;
    int4 dst = convert_int4(data * outputScale + outputZP);
    int4 dst = convert_int4_rte(data * outputScale + outputZP);

    write_imagei(output, coord, dst);
}

@@ -124,7 +124,7 @@ __kernel void minimum_I32I32toI32
    float4 data0 = convert_float4(src0) * input0Scale - input0Tail;
    float4 data1 = convert_float4(src1) * input1Scale - input1Tail;
    float4 data = data0 < data1 ? data0 : data1;
    int4 dst = convert_int4(data * outputScale + outputZP);
    int4 dst = convert_int4_rte(data * outputScale + outputZP);

    write_imagei(output, coord, dst);
}

@@ -150,7 +150,7 @@ __kernel void minimum_I32I32toI32_2D
    float4 data0 = convert_float4(src0) * input0Scale - input0Tail;
    float4 data1 = convert_float4(src1) * input1Scale - input1Tail;
    float4 data = data0 < data1 ? data0 : data1;
    int4 dst = convert_int4(data * outputScale + outputZP);
    int4 dst = convert_int4_rte(data * outputScale + outputZP);

    write_imagei(output, coord, dst);
}
@ -1,12 +1,14 @@
|
|||
|
||||
inline float roi_align_1x1
|
||||
(
|
||||
__read_only image2d_array_t input,
|
||||
float2 region_start,
|
||||
float2 region_end,
|
||||
float2 bin_size,
|
||||
int2 grid_size,
|
||||
float2 rcp_of_grid_size,
|
||||
int pz
|
||||
float2 region_start,
|
||||
float2 region_end,
|
||||
float2 bin_size,
|
||||
int2 grid_size,
|
||||
float2 rcp_of_grid_size,
|
||||
int pz,
|
||||
int4 max_spatial_dims
|
||||
)
|
||||
{
|
||||
float sum = 0;
|
||||
|
|
@ -21,15 +23,24 @@ inline float roi_align_1x1
|
|||
int2 xy_low = convert_int2(pos);
|
||||
int2 xy_high = xy_low + 1;
|
||||
|
||||
float ly = pos.y - xy_low.y;
|
||||
float lx = pos.x - xy_low.x;
|
||||
float hy = 1.0f - ly;
|
||||
float hx = 1.0f - lx;
|
||||
if (xy_low.x > max_spatial_dims.x || max_spatial_dims.x < -1 ||
|
||||
xy_low.y > max_spatial_dims.y || max_spatial_dims.y < -1 )
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
float2 lxy = pos - floor(pos);
|
||||
float2 zero = 0;
|
||||
|
||||
lxy = xy_low >= max_spatial_dims.zw ? 0.0 : lxy;
|
||||
|
||||
float hy = 1.0f - lxy.y;
|
||||
float hx = 1.0f - lxy.x;
|
||||
|
||||
float w1 = hy * hx;
|
||||
float w2 = hy * lx;
|
||||
float w3 = ly * hx;
|
||||
float w4 = ly * lx;
|
||||
float w2 = lxy.x - lxy.x * lxy.y;
|
||||
float w3 = lxy.y - lxy.x * lxy.y;
|
||||
float w4 = lxy.y * lxy.x;
|
||||
|
||||
float data1 = read_imagef(input, (int4)(xy_low.x, xy_low.y, pz, 0)).x;
|
||||
float data2 = read_imagef(input, (int4)(xy_high.x, xy_low.y, pz, 0)).x;
|
||||
|
|
@ -43,8 +54,9 @@ inline float roi_align_1x1
|
|||
return (float)(sum * rcp_of_grid_size.x * rcp_of_grid_size.y);
|
||||
}
|
||||
|
||||
|
||||
#define EPS_GRID 0.00001f
|
||||
#define TYPE_FLOAT16 (1)
|
||||
#define TYPE_FLOAT32 (2)
|
||||
__kernel void roi_align_F32_F32toF32
|
||||
(
|
||||
__read_only image2d_array_t input,
|
||||
|
|
@ -57,13 +69,14 @@ __kernel void roi_align_F32_F32toF32
|
|||
float output_zp,
|
||||
float spatial_x_scale,
|
||||
float spatial_y_scale,
|
||||
float in_width,
|
||||
float in_height,
|
||||
int in_width,
|
||||
int in_height,
|
||||
float rcp_of_out_width,
|
||||
float rcp_of_out_height,
|
||||
float sampling_x_ratio,
|
||||
float sampling_y_ratio,
|
||||
int depth
|
||||
int depth,
|
||||
int dtype
|
||||
)
|
||||
{
|
||||
int px = get_global_id(0);
|
||||
|
|
@ -82,7 +95,10 @@ __kernel void roi_align_F32_F32toF32
|
|||
|
||||
float2 spatial_indx = (float2)(px, py);
|
||||
float2 pooled_dims = (float2)(rcp_of_out_width, rcp_of_out_height);
|
||||
float2 max_spatial_dims = (float2)(in_width, in_height);
|
||||
int4 max_spatial_dims = (int4)(in_width, in_height, in_width, in_height);
|
||||
max_spatial_dims.zw = max_spatial_dims.zw - 1;
|
||||
|
||||
float2 max_limiatation = convert_float2(max_spatial_dims.zw);
|
||||
|
||||
float2 bin_size = roi_dims * pooled_dims;
|
||||
float2 region_start = spatial_indx * bin_size + roi_anchor.xy;
|
||||
|
|
@ -105,9 +121,28 @@ __kernel void roi_align_F32_F32toF32
|
|||
bin_size,
|
||||
grid_size_xy,
|
||||
rcp_of_grid_size,
|
||||
kz);
|
||||
kz,
|
||||
max_spatial_dims);
|
||||
|
||||
write_imagef(output, (int4)(px, py, kz1, 0), interp);
|
||||
if (dtype == TYPE_FLOAT16)
|
||||
{
|
||||
half tmp;
|
||||
short dst;
|
||||
_viv_asm(CONV, tmp, interp.x);
|
||||
_viv_asm(COPY, dst, tmp, 2);
|
||||
|
||||
Tensor out_t = create_tensor_from_image2d_array(output, 2);
|
||||
short *output_ptr = (short *)get_tensor_ptr_from_coord(out_t, (int4)(px, py, kz1, 0));
|
||||
|
||||
output_ptr[0] = dst;
|
||||
}
|
||||
else
|
||||
{
|
||||
Tensor out_t = create_tensor_from_image2d_array(output, 4);
|
||||
float *output_ptr = (float *)get_tensor_ptr_from_coord(out_t, (int4)(px, py, kz1, 0));
|
||||
|
||||
output_ptr[0] = interp.x;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -121,7 +156,8 @@ inline float roi_align_1x1_U8toF32
|
|||
float2 bin_size,
|
||||
int2 grid_size,
|
||||
float2 rcp_of_grid_size,
|
||||
int pz
|
||||
int pz,
|
||||
int4 max_spatial_dims
|
||||
)
|
||||
{
|
||||
float sum = 0;
|
||||
|
|
@ -132,33 +168,43 @@ inline float roi_align_1x1_U8toF32
|
|||
{
|
||||
float2 ixy = (float2)(ix + 0.5f, iy + 0.5f);
|
||||
float2 pos = region_start + ixy * bin_size * rcp_of_grid_size;
|
||||
|
||||
|
||||
int2 xy_low = convert_int2(pos);
|
||||
int2 xy_high = xy_low + 1;
|
||||
|
||||
float ly = pos.y - xy_low.y;
|
||||
float lx = pos.x - xy_low.x;
|
||||
float hy = 1.0f - ly;
|
||||
float hx = 1.0f - lx;
|
||||
|
||||
|
||||
float2 lxy = pos - floor(pos);
|
||||
float2 zero = 0;
|
||||
|
||||
if (xy_low.x > max_spatial_dims.x || max_spatial_dims.x < -1 ||
|
||||
xy_low.y > max_spatial_dims.y || max_spatial_dims.y < -1 )
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
lxy = xy_low >= max_spatial_dims.zw ? 0.0 : lxy;
|
||||
|
||||
float hy = 1.0f - lxy.y;
|
||||
float hx = 1.0f - lxy.x;
|
||||
|
||||
float w1 = hy * hx;
|
||||
float w2 = hy * lx;
|
||||
float w3 = ly * hx;
|
||||
float w4 = ly * lx;
|
||||
|
||||
float w2 = lxy.x - lxy.x * lxy.y;
|
||||
float w3 = lxy.y - lxy.x * lxy.y;
|
||||
float w4 = lxy.y * lxy.x;
|
||||
|
||||
uint4 data;
|
||||
data.x = read_imageui(input, (int4)(xy_low.x, xy_low.y, pz, 0)).x;
|
||||
data.y = read_imageui(input, (int4)(xy_high.x, xy_low.y, pz, 0)).x;
|
||||
data.z = read_imageui(input, (int4)(xy_low.x, xy_high.y, pz, 0)).x;
|
||||
data.w = read_imageui(input, (int4)(xy_high.x, xy_high.y, pz, 0)).x;
|
||||
|
||||
|
||||
float4 value = convert_float4(data) * input_scale + input_tail;
|
||||
|
||||
|
||||
sum = sum + w1 * value.x + w2 * value.y + w3 * value.z + w4 * value.w;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return (float)(sum * rcp_of_grid_size.x * rcp_of_grid_size.y);
|
||||
|
||||
}
|
||||
|
||||
__kernel void roi_align_U8_U16toU8
|
||||
|
|
@ -173,13 +219,14 @@ __kernel void roi_align_U8_U16toU8
|
|||
float output_zp,
|
||||
float spatial_x_scale,
|
||||
float spatial_y_scale,
|
||||
float in_width,
|
||||
float in_height,
|
||||
int in_width,
|
||||
int in_height,
|
||||
float rcp_of_out_width,
|
||||
float rcp_of_out_height,
|
||||
float sampling_x_ratio,
|
||||
float sampling_y_ratio,
|
||||
int depth
|
||||
int depth,
|
||||
int dtype
|
||||
)
|
||||
{
|
||||
int px = get_global_id(0);
|
||||
|
|
@ -198,7 +245,10 @@ __kernel void roi_align_U8_U16toU8
|
|||
|
||||
float2 spatial_indx = (float2)(px, py);
|
||||
float2 pooled_dims = (float2)(rcp_of_out_width, rcp_of_out_height);
|
||||
float2 max_spatial_dims = (float2)(in_width, in_height);
|
||||
int4 max_spatial_dims = (int4)(in_width, in_height, in_width, in_height);
|
||||
max_spatial_dims.zw = max_spatial_dims.zw - 1;
|
||||
|
||||
float2 max_limiatation = convert_float2(max_spatial_dims.zw);
|
||||
|
||||
float2 bin_size = roi_dims * pooled_dims;
|
||||
float2 region_start = spatial_indx * bin_size + roi_anchor.xy;
|
||||
|
|
@ -223,12 +273,17 @@ __kernel void roi_align_U8_U16toU8
|
|||
bin_size,
|
||||
grid_size_xy,
|
||||
rcp_of_grid_size,
|
||||
kz);
|
||||
kz,
|
||||
max_spatial_dims);
|
||||
|
||||
uint4 dst;
|
||||
uchar dst;
|
||||
interp.x = interp.x * output_scale + output_zp;
|
||||
interp.x = interp.x < 255 ? interp.x : 255;
|
||||
dst.x = convert_uint_rte(interp.x);
|
||||
write_imageui(output, (int4)(px, py, kz1, 0), dst.xxxx);
|
||||
dst = convert_uchar_rte(interp.x);
|
||||
|
||||
Tensor out_t = create_tensor_from_image2d_array(output, 1);
|
||||
uchar *output_ptr = (uchar *)get_tensor_ptr_from_coord(out_t, (int4)(px, py, kz1, 0));
|
||||
|
||||
output_ptr[0] = dst;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,298 @@
|
|||
|
||||
#define SCATTER_ELEMENTS_AXIS0_32BITS_IMPL(name, dtype) \
|
||||
__kernel void scatter_elements_axis0_##name \
|
||||
( \
|
||||
__read_only image2d_t ref, \
|
||||
__read_only image2d_t indices, \
|
||||
__read_only image2d_t update, \
|
||||
__write_only image2d_t output, \
|
||||
int axis, \
|
||||
int reduction, \
|
||||
float ref_scale, \
|
||||
float ref_tail, \
|
||||
float update_scale, \
|
||||
float update_tail, \
|
||||
float output_zp, \
|
||||
int inner_size, \
|
||||
int axis_size, \
|
||||
int outer_size \
|
||||
) \
|
||||
{ \
|
||||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
|
||||
\
|
||||
Image ref_i = create_image_from_image2d(ref, 4); \
|
||||
Image update_i = create_image_from_image2d(update, 4); \
|
||||
Image indices_i = create_image_from_image2d(indices, 4); \
|
||||
Image output_i = create_image_from_image2d(output, 4); \
|
||||
\
|
||||
dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \
|
||||
dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \
|
||||
dtype data = ref_ptr[0]; \
|
||||
if (coord.y < outer_size) \
|
||||
{ \
|
||||
dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \
|
||||
int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \
|
||||
for(int x = 0; x < axis_size; x ++) \
|
||||
{ \
|
||||
int offset = indices_ptr[x]; \
|
||||
if (offset == coord.x) \
|
||||
{ \
|
||||
data = update_ptr[x]; \
|
||||
break; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
output_ptr[0] = data; \
|
||||
}
|
||||
SCATTER_ELEMENTS_AXIS0_32BITS_IMPL(F32_I32_F32toF32, float)
|
||||
SCATTER_ELEMENTS_AXIS0_32BITS_IMPL(I32_I32_I32toI32, int)
|
||||
|
||||
#define SCATTER_ELEMENTS_AXIS0_16BITS_IMPL(name, dtype, conv_func) \
|
||||
__kernel void scatter_elements_axis0_##name \
|
||||
( \
|
||||
__read_only image2d_t ref, \
|
||||
__read_only image2d_t indices, \
|
||||
__read_only image2d_t update, \
|
||||
__write_only image2d_t output, \
|
||||
int axis, \
|
||||
int reduction, \
|
||||
float ref_scale, \
|
||||
float ref_tail, \
|
||||
float update_scale, \
|
||||
float update_tail, \
|
||||
float output_zp, \
|
||||
int inner_size, \
|
||||
int axis_size, \
|
||||
int outer_size \
|
||||
) \
|
||||
{ \
|
||||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
|
||||
\
|
||||
Image ref_i = create_image_from_image2d(ref, 2); \
|
||||
Image update_i = create_image_from_image2d(update, 2); \
|
||||
Image indices_i = create_image_from_image2d(indices, 4); \
|
||||
Image output_i = create_image_from_image2d(output, 2); \
|
||||
\
|
||||
dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \
|
||||
dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \
|
||||
dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \
|
||||
if (coord.y < outer_size) \
|
||||
{ \
|
||||
dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \
|
||||
int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \
|
||||
for(int x = 0; x < axis_size; x ++) \
|
||||
{ \
|
||||
int offset = indices_ptr[x]; \
|
||||
if (offset == coord.x) \
|
||||
{ \
|
||||
data = conv_func(convert_float(update_ptr[x]) * update_scale + update_tail + output_zp); \
|
||||
break; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
output_ptr[0] = data; \
|
||||
}
|
||||
SCATTER_ELEMENTS_AXIS0_16BITS_IMPL(I16_I32_I16toI16, short, convert_short_rte)
|
||||
SCATTER_ELEMENTS_AXIS0_16BITS_IMPL(F16_I32_F16toF16, short, convert_short)
|
||||
SCATTER_ELEMENTS_AXIS0_16BITS_IMPL(BF16_I32_BF16toBF16, ushort, convert_ushort)
|
||||
|
||||
#define SCATTER_ELEMENTS_AXIS0_8BITS_IMPL(name, dtype, conv_func) \
|
||||
__kernel void scatter_elements_axis0_##name \
|
||||
( \
|
||||
__read_only image2d_t ref, \
|
||||
__read_only image2d_t indices, \
|
||||
__read_only image2d_t update, \
|
||||
__write_only image2d_t output, \
|
||||
int axis, \
|
||||
int reduction, \
|
||||
float ref_scale, \
|
||||
float ref_tail, \
|
||||
float update_scale, \
|
||||
float update_tail, \
|
||||
float output_zp, \
|
||||
int inner_size, \
|
||||
int axis_size, \
|
||||
int outer_size \
|
||||
) \
|
||||
{ \
|
||||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
|
||||
\
|
||||
Image ref_i = create_image_from_image2d(ref, 1); \
|
||||
Image update_i = create_image_from_image2d(update, 1); \
|
||||
Image indices_i = create_image_from_image2d(indices, 4); \
|
||||
Image output_i = create_image_from_image2d(output, 1); \
|
||||
\
|
||||
dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \
|
||||
dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \
|
||||
dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \
|
||||
if (coord.y < outer_size) \
|
||||
{ \
|
||||
dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \
|
||||
int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \
|
||||
for(int x = 0; x < axis_size; x ++) \
|
||||
{ \
|
||||
int offset = indices_ptr[x]; \
|
||||
if (offset == coord.x) \
|
||||
{ \
|
||||
data = conv_func(convert_float(update_ptr[x]) * update_scale + update_tail + output_zp); \
|
||||
break; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
output_ptr[0] = data; \
|
||||
}
|
||||
SCATTER_ELEMENTS_AXIS0_8BITS_IMPL(U8_I32_U8toU8, uchar, convert_uchar_rte)
|
||||
SCATTER_ELEMENTS_AXIS0_8BITS_IMPL(I8_I32_I8toI8, char, convert_char)
|
||||
|
||||
#define SCATTER_ELEMENTS_AXIS1_32BITS_IMPL(name, dtype) \
|
||||
__kernel void scatter_elements_axis1_##name \
|
||||
( \
|
||||
__read_only image2d_array_t ref, \
|
||||
__read_only image2d_array_t indices, \
|
||||
__read_only image2d_array_t update, \
|
||||
__write_only image2d_array_t output, \
|
||||
int axis, \
|
||||
int reduction, \
|
||||
float ref_scale, \
|
||||
float ref_tail, \
|
||||
float update_scale, \
|
||||
float update_tail, \
|
||||
float output_zp, \
|
||||
int inner_size, \
|
||||
int axis_size, \
|
||||
int outer_size \
|
||||
) \
|
||||
{ \
|
||||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
|
||||
\
|
||||
Tensor ref_i = create_tensor_from_image2d_array(ref, 4); \
|
||||
Tensor update_i = create_tensor_from_image2d_array(update, 4); \
|
||||
Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \
|
||||
Tensor output_i = create_tensor_from_image2d_array(output, 4); \
|
||||
\
|
||||
dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \
|
||||
dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \
|
||||
dtype data = ref_ptr[0]; \
|
||||
if (coord.x < inner_size && coord.z < outer_size) \
|
||||
{ \
|
||||
dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \
|
||||
int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \
|
||||
for(int y = 0; y < axis_size; y ++) \
|
||||
{ \
|
||||
int offset = indices_ptr[y * inner_size]; \
|
||||
if (offset == coord.y) \
|
||||
{ \
|
||||
data = update_ptr[y * inner_size]; \
|
||||
break; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
output_ptr[0] = data; \
|
||||
}
|
||||
SCATTER_ELEMENTS_AXIS1_32BITS_IMPL(F32_I32_F32toF32, float)
|
||||
SCATTER_ELEMENTS_AXIS1_32BITS_IMPL(I32_I32_I32toI32, int)
|
||||
|
||||
#define SCATTER_ELEMENTS_AXIS1_16BITS_IMPL(name, dtype, conv_func) \
|
||||
__kernel void scatter_elements_axis1_##name \
|
||||
( \
|
||||
__read_only image2d_array_t ref, \
|
||||
__read_only image2d_array_t indices, \
|
||||
__read_only image2d_array_t update, \
|
||||
__write_only image2d_array_t output, \
|
||||
int axis, \
|
||||
int reduction, \
|
||||
float ref_scale, \
|
||||
float ref_tail, \
|
||||
float update_scale, \
|
||||
float update_tail, \
|
||||
float output_zp, \
|
||||
int inner_size, \
|
||||
int axis_size, \
|
||||
int outer_size \
|
||||
) \
|
||||
{ \
|
||||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
|
||||
\
|
||||
Tensor ref_i = create_tensor_from_image2d_array(ref, 2); \
|
||||
Tensor update_i = create_tensor_from_image2d_array(update, 2); \
|
||||
Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \
|
||||
Tensor output_i = create_tensor_from_image2d_array(output, 2); \
|
||||
\
|
||||
dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \
|
||||
dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \
|
||||
dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \
|
||||
if (coord.x < inner_size && coord.z < outer_size) \
|
||||
{ \
|
||||
dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \
|
||||
int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \
|
||||
for(int y = 0; y < axis_size; y ++) \
|
||||
{ \
|
||||
int offset = indices_ptr[y * inner_size]; \
|
||||
if (offset == coord.y) \
|
||||
{ \
|
||||
data = conv_func(convert_float(update_ptr[y * inner_size]) \
|
||||
* update_scale + update_tail + output_zp); \
|
||||
break; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
output_ptr[0] = data; \
|
||||
}
|
||||
SCATTER_ELEMENTS_AXIS1_16BITS_IMPL(I16_I32_I16toI16, short, convert_short_rte)
|
||||
SCATTER_ELEMENTS_AXIS1_16BITS_IMPL(F16_I32_F16toF16, short, convert_short)
|
||||
SCATTER_ELEMENTS_AXIS1_16BITS_IMPL(BF16_I32_BF16toBF16, ushort, convert_ushort)
|
||||
|
||||
#define SCATTER_ELEMENTS_AXIS1_8BITS_IMPL(name, dtype, conv_func) \
|
||||
__kernel void scatter_elements_axis1_##name \
|
||||
( \
|
||||
__read_only image2d_array_t ref, \
|
||||
__read_only image2d_array_t indices, \
|
||||
__read_only image2d_array_t update, \
|
||||
__write_only image2d_array_t output, \
|
||||
int axis, \
|
||||
int reduction, \
|
||||
float ref_scale, \
|
||||
float ref_tail, \
|
||||
float update_scale, \
|
||||
float update_tail, \
|
||||
float output_zp, \
|
||||
int inner_size, \
|
||||
int axis_size, \
|
||||
int outer_size \
|
||||
) \
|
||||
{ \
|
||||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
|
||||
\
|
||||
Tensor ref_i = create_tensor_from_image2d_array(ref, 1); \
|
||||
Tensor update_i = create_tensor_from_image2d_array(update, 1); \
|
||||
Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \
|
||||
Tensor output_i = create_tensor_from_image2d_array(output, 1); \
|
||||
\
|
||||
dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \
|
||||
dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \
|
||||
dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \
|
||||
if (coord.x < inner_size && coord.z < outer_size) \
|
||||
{ \
|
||||
dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \
|
||||
int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \
|
||||
for(int y = 0; y < axis_size; y ++) \
|
||||
{ \
|
||||
int offset = indices_ptr[y * inner_size]; \
|
||||
if (offset == coord.y) \
|
||||
{ \
|
||||
data = conv_func(convert_float(update_ptr[y * inner_size]) \
|
||||
* update_scale + update_tail + output_zp); \
|
||||
break; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
output_ptr[0] = data; \
|
||||
}
|
||||
SCATTER_ELEMENTS_AXIS1_8BITS_IMPL(U8_I32_U8toU8, uchar, convert_uchar_rte)
|
||||
SCATTER_ELEMENTS_AXIS1_8BITS_IMPL(I8_I32_I8toI8, char, convert_char)
|
||||
@@ -0,0 +1,292 @@
#define SE_ADD_AXIS0_32BITS_IMPL(name, dtype) \
|
||||
__kernel void scatter_elements_add_axis0_##name \
|
||||
( \
|
||||
__read_only image2d_t ref, \
|
||||
__read_only image2d_t indices, \
|
||||
__read_only image2d_t update, \
|
||||
__write_only image2d_t output, \
|
||||
int axis, \
|
||||
int reduction, \
|
||||
float ref_scale, \
|
||||
float ref_tail, \
|
||||
float update_scale, \
|
||||
float update_tail, \
|
||||
float output_zp, \
|
||||
int inner_size, \
|
||||
int axis_size, \
|
||||
int outer_size \
|
||||
) \
|
||||
{ \
|
||||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
|
||||
\
|
||||
Image ref_i = create_image_from_image2d(ref, 4); \
|
||||
Image update_i = create_image_from_image2d(update, 4); \
|
||||
Image indices_i = create_image_from_image2d(indices, 4); \
|
||||
Image output_i = create_image_from_image2d(output, 4); \
|
||||
\
|
||||
dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \
|
||||
dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \
|
||||
dtype data = ref_ptr[0]; \
|
||||
if (coord.y < outer_size) \
|
||||
{ \
|
||||
dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \
|
||||
int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \
|
||||
for(int x = 0; x < axis_size; x ++) \
|
||||
{ \
|
||||
int offset = indices_ptr[x]; \
|
||||
if (offset == coord.x) \
|
||||
{ \
|
||||
data += update_ptr[x]; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
output_ptr[0] = data; \
|
||||
}
|
||||
SE_ADD_AXIS0_32BITS_IMPL(F32_I32_F32toF32, float)
|
||||
SE_ADD_AXIS0_32BITS_IMPL(I32_I32_I32toI32, int)
|
||||
|
||||
#define SE_ADD_AXIS0_16BITS_IMPL(name, dtype, conv_func) \
|
||||
__kernel void scatter_elements_add_axis0_##name \
|
||||
( \
|
||||
__read_only image2d_t ref, \
|
||||
__read_only image2d_t indices, \
|
||||
__read_only image2d_t update, \
|
||||
__write_only image2d_t output, \
|
||||
int axis, \
|
||||
int reduction, \
|
||||
float ref_scale, \
|
||||
float ref_tail, \
|
||||
float update_scale, \
|
||||
float update_tail, \
|
||||
float output_zp, \
|
||||
int inner_size, \
|
||||
int axis_size, \
|
||||
int outer_size \
|
||||
) \
|
||||
{ \
|
||||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
|
||||
\
|
||||
Image ref_i = create_image_from_image2d(ref, 2); \
|
||||
Image update_i = create_image_from_image2d(update, 2); \
|
||||
Image indices_i = create_image_from_image2d(indices, 4); \
|
||||
Image output_i = create_image_from_image2d(output, 2); \
|
||||
\
|
||||
dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \
|
||||
dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \
|
||||
dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \
|
||||
if (coord.y < outer_size) \
|
||||
{ \
|
||||
dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \
|
||||
int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \
|
||||
for(int x = 0; x < axis_size; x ++) \
|
||||
{ \
|
||||
int offset = indices_ptr[x]; \
|
||||
if (offset == coord.x) \
|
||||
{ \
|
||||
data += conv_func(convert_float(update_ptr[x]) * update_scale + update_tail + output_zp); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
output_ptr[0] = data; \
|
||||
}
|
||||
SE_ADD_AXIS0_16BITS_IMPL(I16_I32_I16toI16, short, convert_short_rte)
|
||||
SE_ADD_AXIS0_16BITS_IMPL(F16_I32_F16toF16, short, convert_short)
|
||||
SE_ADD_AXIS0_16BITS_IMPL(BF16_I32_BF16toBF16, ushort, convert_ushort)
|
||||
|
||||
#define SE_ADD_AXIS0_8BITS_IMPL(name, dtype, conv_func) \
|
||||
__kernel void scatter_elements_add_axis0_##name \
|
||||
( \
|
||||
__read_only image2d_t ref, \
|
||||
__read_only image2d_t indices, \
|
||||
__read_only image2d_t update, \
|
||||
__write_only image2d_t output, \
|
||||
int axis, \
|
||||
int reduction, \
|
||||
float ref_scale, \
|
||||
float ref_tail, \
|
||||
float update_scale, \
|
||||
float update_tail, \
|
||||
float output_zp, \
|
||||
int inner_size, \
|
||||
int axis_size, \
|
||||
int outer_size \
|
||||
) \
|
||||
{ \
|
||||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
|
||||
\
|
||||
Image ref_i = create_image_from_image2d(ref, 1); \
|
||||
Image update_i = create_image_from_image2d(update, 1); \
|
||||
Image indices_i = create_image_from_image2d(indices, 4); \
|
||||
Image output_i = create_image_from_image2d(output, 1); \
|
||||
\
|
||||
dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \
|
||||
dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \
|
||||
dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \
|
||||
if (coord.y < outer_size) \
|
||||
{ \
|
||||
dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \
|
||||
int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \
|
||||
for(int x = 0; x < axis_size; x ++) \
|
||||
{ \
|
||||
int offset = indices_ptr[x]; \
|
||||
if (offset == coord.x) \
|
||||
{ \
|
||||
data += conv_func(convert_float(update_ptr[x]) * update_scale + update_tail + output_zp); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
output_ptr[0] = data; \
|
||||
}
|
||||
SE_ADD_AXIS0_8BITS_IMPL(U8_I32_U8toU8, uchar, convert_uchar_rte)
|
||||
SE_ADD_AXIS0_8BITS_IMPL(I8_I32_I8toI8, char, convert_char)
|
||||
|
||||
#define SE_ADD_AXIS1_32BITS_IMPL(name, dtype) \
|
||||
__kernel void scatter_elements_add_axis1_##name \
|
||||
( \
|
||||
__read_only image2d_array_t ref, \
|
||||
__read_only image2d_array_t indices, \
|
||||
__read_only image2d_array_t update, \
|
||||
__write_only image2d_array_t output, \
|
||||
int axis, \
|
||||
int reduction, \
|
||||
float ref_scale, \
|
||||
float ref_tail, \
|
||||
float update_scale, \
|
||||
float update_tail, \
|
||||
float output_zp, \
|
||||
int inner_size, \
|
||||
int axis_size, \
|
||||
int outer_size \
|
||||
) \
|
||||
{ \
|
||||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
|
||||
\
|
||||
Tensor ref_i = create_tensor_from_image2d_array(ref, 4); \
|
||||
Tensor update_i = create_tensor_from_image2d_array(update, 4); \
|
||||
Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \
|
||||
Tensor output_i = create_tensor_from_image2d_array(output, 4); \
|
||||
\
|
||||
dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \
|
||||
dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \
|
||||
dtype data = ref_ptr[0]; \
|
||||
if (coord.x < inner_size && coord.z < outer_size) \
|
||||
{ \
|
||||
dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \
|
||||
int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \
|
||||
for(int y = 0; y < axis_size; y ++) \
|
||||
{ \
|
||||
int offset = indices_ptr[y * inner_size]; \
|
||||
if (offset == coord.y) \
|
||||
{ \
|
||||
data += update_ptr[y * inner_size]; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
output_ptr[0] = data; \
|
||||
}
|
||||
SE_ADD_AXIS1_32BITS_IMPL(F32_I32_F32toF32, float)
|
||||
SE_ADD_AXIS1_32BITS_IMPL(I32_I32_I32toI32, int)
|
||||
|
||||
#define SE_ADD_AXIS1_16BITS_IMPL(name, dtype, conv_func) \
|
||||
__kernel void scatter_elements_add_axis1_##name \
|
||||
( \
|
||||
__read_only image2d_array_t ref, \
|
||||
__read_only image2d_array_t indices, \
|
||||
__read_only image2d_array_t update, \
|
||||
__write_only image2d_array_t output, \
|
||||
int axis, \
|
||||
int reduction, \
|
||||
float ref_scale, \
|
||||
float ref_tail, \
|
||||
float update_scale, \
|
||||
float update_tail, \
|
||||
float output_zp, \
|
||||
int inner_size, \
|
||||
int axis_size, \
|
||||
int outer_size \
|
||||
) \
|
||||
{ \
|
||||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
|
||||
\
|
||||
Tensor ref_i = create_tensor_from_image2d_array(ref, 2); \
|
||||
Tensor update_i = create_tensor_from_image2d_array(update, 2); \
|
||||
Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \
|
||||
Tensor output_i = create_tensor_from_image2d_array(output, 2); \
|
||||
\
|
||||
dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \
|
||||
dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \
|
||||
dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \
|
||||
if (coord.x < inner_size && coord.z < outer_size) \
|
||||
{ \
|
||||
dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \
|
||||
int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \
|
||||
for(int y = 0; y < axis_size; y ++) \
|
||||
{ \
|
||||
int offset = indices_ptr[y * inner_size]; \
|
||||
if (offset == coord.y) \
|
||||
{ \
|
||||
data += conv_func(convert_float(update_ptr[y * inner_size]) \
|
||||
* update_scale + update_tail + output_zp); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
output_ptr[0] = data; \
|
||||
}
|
||||
SE_ADD_AXIS1_16BITS_IMPL(I16_I32_I16toI16, short, convert_short_rte)
|
||||
SE_ADD_AXIS1_16BITS_IMPL(F16_I32_F16toF16, short, convert_short)
|
||||
SE_ADD_AXIS1_16BITS_IMPL(BF16_I32_BF16toBF16, ushort, convert_ushort)
|
||||
|
||||
#define SE_ADD_AXIS1_8BITS_IMPL(name, dtype, conv_func) \
|
||||
__kernel void scatter_elements_add_axis1_##name \
|
||||
( \
|
||||
__read_only image2d_array_t ref, \
|
||||
__read_only image2d_array_t indices, \
|
||||
__read_only image2d_array_t update, \
|
||||
__write_only image2d_array_t output, \
|
||||
int axis, \
|
||||
int reduction, \
|
||||
float ref_scale, \
|
||||
float ref_tail, \
|
||||
float update_scale, \
|
||||
float update_tail, \
|
||||
float output_zp, \
|
||||
int inner_size, \
|
||||
int axis_size, \
|
||||
int outer_size \
|
||||
) \
|
||||
{ \
|
||||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
|
||||
\
|
||||
Tensor ref_i = create_tensor_from_image2d_array(ref, 1); \
|
||||
Tensor update_i = create_tensor_from_image2d_array(update, 1); \
|
||||
Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \
|
||||
Tensor output_i = create_tensor_from_image2d_array(output, 1); \
|
||||
\
|
||||
dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \
|
||||
dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \
|
||||
dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \
|
||||
if (coord.x < inner_size && coord.z < outer_size) \
|
||||
{ \
|
||||
dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \
|
||||
int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \
|
||||
for(int y = 0; y < axis_size; y ++) \
|
||||
{ \
|
||||
int offset = indices_ptr[y * inner_size]; \
|
||||
if (offset == coord.y) \
|
||||
{ \
|
||||
data += conv_func(convert_float(update_ptr[y * inner_size]) \
|
||||
* update_scale + update_tail + output_zp); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
output_ptr[0] = data; \
|
||||
}
|
||||
SE_ADD_AXIS1_8BITS_IMPL(U8_I32_U8toU8, uchar, convert_uchar_rte)
|
||||
SE_ADD_AXIS1_8BITS_IMPL(I8_I32_I8toI8, char, convert_char)
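The scatter_elements kernels above all use the same inner/axis/outer decomposition: ref is treated as a tensor flattened to (outer_size, axis_size, inner_size), the loop walks indices/update along the scatter axis, and every matching index folds the (rescaled) update into the output element. As a hedged illustration only, a minimal host-side reference in plain C might look like the sketch below; the function name, the flattened row-major layout, and the absence of quantization rescaling are my assumptions, not part of this commit.

#include <stddef.h>

/* Hypothetical reference for scatter-elements with "add" reduction along the
 * middle ("axis") dimension of tensors flattened to
 * (outer_size, axis_size, inner_size). Mirrors the per-element loop of the
 * OpenCL kernels above, without the scale/tail/zero-point rescaling. */
static void scatter_elements_add_ref(float *out, const float *ref,
                                     const int *indices, const float *update,
                                     size_t outer_size, size_t ref_axis_size,
                                     size_t upd_axis_size, size_t inner_size)
{
    /* Output starts as a copy of ref. */
    for (size_t i = 0; i < outer_size * ref_axis_size * inner_size; i++)
        out[i] = ref[i];

    for (size_t o = 0; o < outer_size; o++)
        for (size_t a = 0; a < upd_axis_size; a++)
            for (size_t i = 0; i < inner_size; i++) {
                size_t src = (o * upd_axis_size + a) * inner_size + i;
                size_t dst = (o * ref_axis_size + (size_t)indices[src]) * inner_size + i;
                out[dst] += update[src];  /* "mul" reduction would use *= instead */
            }
}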
@@ -0,0 +1,292 @@
#define SE_MUL_AXIS0_32BITS_IMPL(name, dtype) \
|
||||
__kernel void scatter_elements_mul_axis0_##name \
|
||||
( \
|
||||
__read_only image2d_t ref, \
|
||||
__read_only image2d_t indices, \
|
||||
__read_only image2d_t update, \
|
||||
__write_only image2d_t output, \
|
||||
int axis, \
|
||||
int reduction, \
|
||||
float ref_scale, \
|
||||
float ref_tail, \
|
||||
float update_scale, \
|
||||
float update_tail, \
|
||||
float output_zp, \
|
||||
int inner_size, \
|
||||
int axis_size, \
|
||||
int outer_size \
|
||||
) \
|
||||
{ \
|
||||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
|
||||
\
|
||||
Image ref_i = create_image_from_image2d(ref, 4); \
|
||||
Image update_i = create_image_from_image2d(update, 4); \
|
||||
Image indices_i = create_image_from_image2d(indices, 4); \
|
||||
Image output_i = create_image_from_image2d(output, 4); \
|
||||
\
|
||||
dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \
|
||||
dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \
|
||||
dtype data = ref_ptr[0]; \
|
||||
if (coord.y < outer_size) \
|
||||
{ \
|
||||
dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \
|
||||
int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \
|
||||
for(int x = 0; x < axis_size; x ++) \
|
||||
{ \
|
||||
int offset = indices_ptr[x]; \
|
||||
if (offset == coord.x) \
|
||||
{ \
|
||||
data *= update_ptr[x]; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
output_ptr[0] = data; \
|
||||
}
|
||||
SE_MUL_AXIS0_32BITS_IMPL(F32_I32_F32toF32, float)
|
||||
SE_MUL_AXIS0_32BITS_IMPL(I32_I32_I32toI32, int)
|
||||
|
||||
#define SE_MUL_AXIS0_16BITS_IMPL(name, dtype, conv_func) \
|
||||
__kernel void scatter_elements_mul_axis0_##name \
|
||||
( \
|
||||
__read_only image2d_t ref, \
|
||||
__read_only image2d_t indices, \
|
||||
__read_only image2d_t update, \
|
||||
__write_only image2d_t output, \
|
||||
int axis, \
|
||||
int reduction, \
|
||||
float ref_scale, \
|
||||
float ref_tail, \
|
||||
float update_scale, \
|
||||
float update_tail, \
|
||||
float output_zp, \
|
||||
int inner_size, \
|
||||
int axis_size, \
|
||||
int outer_size \
|
||||
) \
|
||||
{ \
|
||||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
|
||||
\
|
||||
Image ref_i = create_image_from_image2d(ref, 2); \
|
||||
Image update_i = create_image_from_image2d(update, 2); \
|
||||
Image indices_i = create_image_from_image2d(indices, 4); \
|
||||
Image output_i = create_image_from_image2d(output, 2); \
|
||||
\
|
||||
dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \
|
||||
dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \
|
||||
dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \
|
||||
if (coord.y < outer_size) \
|
||||
{ \
|
||||
dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \
|
||||
int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \
|
||||
for(int x = 0; x < axis_size; x ++) \
|
||||
{ \
|
||||
int offset = indices_ptr[x]; \
|
||||
if (offset == coord.x) \
|
||||
{ \
|
||||
data *= conv_func(convert_float(update_ptr[x]) * update_scale + update_tail + output_zp); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
output_ptr[0] = data; \
|
||||
}
|
||||
SE_MUL_AXIS0_16BITS_IMPL(I16_I32_I16toI16, short, convert_short_rte)
|
||||
SE_MUL_AXIS0_16BITS_IMPL(F16_I32_F16toF16, short, convert_short)
|
||||
SE_MUL_AXIS0_16BITS_IMPL(BF16_I32_BF16toBF16, ushort, convert_ushort)
|
||||
|
||||
#define SE_MUL_AXIS0_8BITS_IMPL(name, dtype, conv_func) \
|
||||
__kernel void scatter_elements_mul_axis0_##name \
|
||||
( \
|
||||
__read_only image2d_t ref, \
|
||||
__read_only image2d_t indices, \
|
||||
__read_only image2d_t update, \
|
||||
__write_only image2d_t output, \
|
||||
int axis, \
|
||||
int reduction, \
|
||||
float ref_scale, \
|
||||
float ref_tail, \
|
||||
float update_scale, \
|
||||
float update_tail, \
|
||||
float output_zp, \
|
||||
int inner_size, \
|
||||
int axis_size, \
|
||||
int outer_size \
|
||||
) \
|
||||
{ \
|
||||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
|
||||
\
|
||||
Image ref_i = create_image_from_image2d(ref, 1); \
|
||||
Image update_i = create_image_from_image2d(update, 1); \
|
||||
Image indices_i = create_image_from_image2d(indices, 4); \
|
||||
Image output_i = create_image_from_image2d(output, 1); \
|
||||
\
|
||||
dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \
|
||||
dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \
|
||||
dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \
|
||||
if (coord.y < outer_size) \
|
||||
{ \
|
||||
dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \
|
||||
int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \
|
||||
for(int x = 0; x < axis_size; x ++) \
|
||||
{ \
|
||||
int offset = indices_ptr[x]; \
|
||||
if (offset == coord.x) \
|
||||
{ \
|
||||
data *= conv_func(convert_float(update_ptr[x]) * update_scale + update_tail + output_zp); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
output_ptr[0] = data; \
|
||||
}
|
||||
SE_MUL_AXIS0_8BITS_IMPL(U8_I32_U8toU8, uchar, convert_uchar_rte)
|
||||
SE_MUL_AXIS0_8BITS_IMPL(I8_I32_I8toI8, char, convert_char)
|
||||
|
||||
#define SE_MUL_AXIS1_32BITS_IMPL(name, dtype) \
|
||||
__kernel void scatter_elements_mul_axis1_##name \
|
||||
( \
|
||||
__read_only image2d_array_t ref, \
|
||||
__read_only image2d_array_t indices, \
|
||||
__read_only image2d_array_t update, \
|
||||
__write_only image2d_array_t output, \
|
||||
int axis, \
|
||||
int reduction, \
|
||||
float ref_scale, \
|
||||
float ref_tail, \
|
||||
float update_scale, \
|
||||
float update_tail, \
|
||||
float output_zp, \
|
||||
int inner_size, \
|
||||
int axis_size, \
|
||||
int outer_size \
|
||||
) \
|
||||
{ \
|
||||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
|
||||
\
|
||||
Tensor ref_i = create_tensor_from_image2d_array(ref, 4); \
|
||||
Tensor update_i = create_tensor_from_image2d_array(update, 4); \
|
||||
Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \
|
||||
Tensor output_i = create_tensor_from_image2d_array(output, 4); \
|
||||
\
|
||||
dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \
|
||||
dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \
|
||||
dtype data = ref_ptr[0]; \
|
||||
if (coord.x < inner_size && coord.z < outer_size) \
|
||||
{ \
|
||||
dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \
|
||||
int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \
|
||||
for(int y = 0; y < axis_size; y ++) \
|
||||
{ \
|
||||
int offset = indices_ptr[y * inner_size]; \
|
||||
if (offset == coord.y) \
|
||||
{ \
|
||||
data *= update_ptr[y * inner_size]; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
output_ptr[0] = data; \
|
||||
}
|
||||
SE_MUL_AXIS1_32BITS_IMPL(F32_I32_F32toF32, float)
|
||||
SE_MUL_AXIS1_32BITS_IMPL(I32_I32_I32toI32, int)
|
||||
|
||||
#define SE_MUL_AXIS1_16BITS_IMPL(name, dtype, conv_func) \
|
||||
__kernel void scatter_elements_mul_axis1_##name \
|
||||
( \
|
||||
__read_only image2d_array_t ref, \
|
||||
__read_only image2d_array_t indices, \
|
||||
__read_only image2d_array_t update, \
|
||||
__write_only image2d_array_t output, \
|
||||
int axis, \
|
||||
int reduction, \
|
||||
float ref_scale, \
|
||||
float ref_tail, \
|
||||
float update_scale, \
|
||||
float update_tail, \
|
||||
float output_zp, \
|
||||
int inner_size, \
|
||||
int axis_size, \
|
||||
int outer_size \
|
||||
) \
|
||||
{ \
|
||||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
|
||||
\
|
||||
Tensor ref_i = create_tensor_from_image2d_array(ref, 2); \
|
||||
Tensor update_i = create_tensor_from_image2d_array(update, 2); \
|
||||
Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \
|
||||
Tensor output_i = create_tensor_from_image2d_array(output, 2); \
|
||||
\
|
||||
dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \
|
||||
dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \
|
||||
dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \
|
||||
if (coord.x < inner_size && coord.z < outer_size) \
|
||||
{ \
|
||||
dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \
|
||||
int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \
|
||||
for(int y = 0; y < axis_size; y ++) \
|
||||
{ \
|
||||
int offset = indices_ptr[y * inner_size]; \
|
||||
if (offset == coord.y) \
|
||||
{ \
|
||||
data *= conv_func(convert_float(update_ptr[y * inner_size]) \
|
||||
* update_scale + update_tail + output_zp); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
output_ptr[0] = data; \
|
||||
}
|
||||
SE_MUL_AXIS1_16BITS_IMPL(I16_I32_I16toI16, short, convert_short_rte)
|
||||
SE_MUL_AXIS1_16BITS_IMPL(F16_I32_F16toF16, short, convert_short)
|
||||
SE_MUL_AXIS1_16BITS_IMPL(BF16_I32_BF16toBF16, ushort, convert_ushort)
|
||||
|
||||
#define SE_MUL_AXIS1_8BITS_IMPL(name, dtype, conv_func) \
|
||||
__kernel void scatter_elements_mul_axis1_##name \
|
||||
( \
|
||||
__read_only image2d_array_t ref, \
|
||||
__read_only image2d_array_t indices, \
|
||||
__read_only image2d_array_t update, \
|
||||
__write_only image2d_array_t output, \
|
||||
int axis, \
|
||||
int reduction, \
|
||||
float ref_scale, \
|
||||
float ref_tail, \
|
||||
float update_scale, \
|
||||
float update_tail, \
|
||||
float output_zp, \
|
||||
int inner_size, \
|
||||
int axis_size, \
|
||||
int outer_size \
|
||||
) \
|
||||
{ \
|
||||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
|
||||
\
|
||||
Tensor ref_i = create_tensor_from_image2d_array(ref, 1); \
|
||||
Tensor update_i = create_tensor_from_image2d_array(update, 1); \
|
||||
Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \
|
||||
Tensor output_i = create_tensor_from_image2d_array(output, 1); \
|
||||
\
|
||||
dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \
|
||||
dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \
|
||||
dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \
|
||||
if (coord.x < inner_size && coord.z < outer_size) \
|
||||
{ \
|
||||
dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \
|
||||
int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \
|
||||
for(int y = 0; y < axis_size; y ++) \
|
||||
{ \
|
||||
int offset = indices_ptr[y * inner_size]; \
|
||||
if (offset == coord.y) \
|
||||
{ \
|
||||
data *= conv_func(convert_float(update_ptr[y * inner_size]) \
|
||||
* update_scale + update_tail + output_zp); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
output_ptr[0] = data; \
|
||||
}
|
||||
SE_MUL_AXIS1_8BITS_IMPL(U8_I32_U8toU8, uchar, convert_uchar_rte)
|
||||
SE_MUL_AXIS1_8BITS_IMPL(I8_I32_I8toI8, char, convert_char)
@@ -0,0 +1,176 @@
#include "cl_viv_vx_ext.h"
|
||||
|
||||
_viv_uniform VXC_512Bits uniDataConvert_0_4x4;
|
||||
_viv_uniform VXC_512Bits uniDataConvert_1_4x4;
|
||||
_viv_uniform int boundaries_size_x8;
|
||||
_viv_uniform int boundaries_size;
|
||||
|
||||
#define BUCKETIZE_16BITS_SH_IMPL(name, copy_type) \
|
||||
__kernel void bucketize_right_##name \
|
||||
( \
|
||||
__read_only image2d_t input, \
|
||||
__read_only image2d_t boundaries, \
|
||||
__write_only image2d_t output \
|
||||
) \
|
||||
{ \
|
||||
int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \
|
||||
\
|
||||
vxc_short8 data0, data1; \
|
||||
copy_type src0, src1, dst0, dst1; \
|
||||
vxc_ushort8 v0, v1, v2, v3, result = 0; \
|
||||
VXC_ReadImage(data0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
_viv_asm(COPY, src0, data0, 16); \
|
||||
\
|
||||
for (; coord.z < boundaries_size_x8; ) \
|
||||
{ \
|
||||
VXC_ReadImage(data1, boundaries, coord.zw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
_viv_asm(COPY, src1, data1.s00000000, 16); \
|
||||
coord.z += 8; \
|
||||
\
|
||||
VXC_Clamp(dst0, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \
|
||||
_viv_asm(COPY, v0, dst0, 16); \
|
||||
v2 = sub_sat(v0, 0xFFFE); \
|
||||
_viv_asm(COPY, src1, data1.s11111111, 16); \
|
||||
VXC_Clamp(dst1, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \
|
||||
_viv_asm(COPY, v1, dst1, 16); \
|
||||
v3 = sub_sat(v1, 0xFFFE); \
|
||||
\
|
||||
result = result + v2 + v3; \
|
||||
\
|
||||
_viv_asm(COPY, src1, data1.s22222222, 16); \
|
||||
VXC_Clamp(dst0, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \
|
||||
_viv_asm(COPY, v0, dst0, 16); \
|
||||
v2 = sub_sat(v0, 0xFFFE); \
|
||||
_viv_asm(COPY, src1, data1.s33333333, 16); \
|
||||
VXC_Clamp(dst1, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \
|
||||
_viv_asm(COPY, v1, dst1, 16); \
|
||||
v3 = sub_sat(v1, 0xFFFE); \
|
||||
\
|
||||
result = result + v2 + v3; \
|
||||
\
|
||||
_viv_asm(COPY, src1, data1.s44444444, 16); \
|
||||
VXC_Clamp(dst0, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \
|
||||
_viv_asm(COPY, v0, dst0, 16); \
|
||||
v2 = sub_sat(v0, 0xFFFE); \
|
||||
_viv_asm(COPY, src1, data1.s55555555, 16); \
|
||||
VXC_Clamp(dst1, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \
|
||||
_viv_asm(COPY, v1, dst1, 16); \
|
||||
v3 = sub_sat(v1, 0xFFFE); \
|
||||
\
|
||||
result = result + v2 + v3; \
|
||||
\
|
||||
_viv_asm(COPY, src1, data1.s66666666, 16); \
|
||||
VXC_Clamp(dst0, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \
|
||||
_viv_asm(COPY, v0, dst0, 16); \
|
||||
v2 = sub_sat(v0, 0xFFFE); \
|
||||
_viv_asm(COPY, src1, data1.s77777777, 16); \
|
||||
VXC_Clamp(dst1, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \
|
||||
_viv_asm(COPY, v1, dst1, 16); \
|
||||
v3 = sub_sat(v1, 0xFFFE); \
|
||||
\
|
||||
result = result + v2 + v3; \
|
||||
} \
|
||||
\
|
||||
for (; coord.z < boundaries_size; ) \
|
||||
{ \
|
||||
VXC_ReadImage(data1, boundaries, coord.zw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
_viv_asm(COPY, src1, data1.s00000000, 16); \
|
||||
coord.z ++; \
|
||||
\
|
||||
VXC_Clamp(dst0, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \
|
||||
_viv_asm(COPY, v0, dst0, 16); \
|
||||
v2 = sub_sat(v0, 0xFFFE); \
|
||||
\
|
||||
result = result + v2; \
|
||||
} \
|
||||
\
|
||||
int4 d0, d1; \
|
||||
VXC_DP4x4(d0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniDataConvert_0_4x4); \
|
||||
VXC_DP4x4(d1, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniDataConvert_1_4x4); \
|
||||
coord.z = coord.x + 4; \
|
||||
\
|
||||
write_imagei(output, coord.xy, d0); \
|
||||
write_imagei(output, coord.zy, d1); \
|
||||
}
|
||||
BUCKETIZE_16BITS_SH_IMPL(F16_F16toI32_2D, vxc_half8)
|
||||
BUCKETIZE_16BITS_SH_IMPL(I16_I16toI32_2D, vxc_short8)
|
||||
|
||||
#define BUCKETIZE_8BITS_SH_IMPL(name, src_type) \
|
||||
__kernel void bucketize_right_##name \
|
||||
( \
|
||||
__read_only image2d_t input, \
|
||||
__read_only image2d_t boundaries, \
|
||||
__write_only image2d_t output \
|
||||
) \
|
||||
{ \
|
||||
int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \
|
||||
\
|
||||
src_type src0, src1, src2; \
|
||||
vxc_uchar8 dst0, dst1, result = 0; \
|
||||
VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(src1, boundaries, coord.zw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
for (; coord.z < boundaries_size_x8; ) \
|
||||
{ \
|
||||
VXC_ReadImage(src1, boundaries, coord.zw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord.z += 8; \
|
||||
\
|
||||
VXC_Clamp(src2, src0, src1.s00000000, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \
|
||||
_viv_asm(COPY, dst0, src2, 8); \
|
||||
dst0 = sub_sat(dst0, 0xFE); \
|
||||
VXC_Clamp(src2, src0, src1.s11111111, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \
|
||||
_viv_asm(COPY, dst1, src2, 8); \
|
||||
dst1 = sub_sat(dst1, 0xFE); \
|
||||
\
|
||||
result = result + dst0 + dst1; \
|
||||
\
|
||||
VXC_Clamp(src2, src0, src1.s22222222, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \
|
||||
_viv_asm(COPY, dst0, src2, 8); \
|
||||
dst0 = sub_sat(dst0, 0xFE); \
|
||||
VXC_Clamp(src2, src0, src1.s33333333, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \
|
||||
_viv_asm(COPY, dst1, src2, 8); \
|
||||
dst1 = sub_sat(dst1, 0xFE); \
|
||||
\
|
||||
result = result + dst0 + dst1; \
|
||||
\
|
||||
VXC_Clamp(src2, src0, src1.s44444444, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \
|
||||
_viv_asm(COPY, dst0, src2, 8); \
|
||||
dst0 = sub_sat(dst0, 0xFE); \
|
||||
VXC_Clamp(src2, src0, src1.s55555555, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \
|
||||
_viv_asm(COPY, dst1, src2, 8); \
|
||||
dst1 = sub_sat(dst1, 0xFE); \
|
||||
\
|
||||
result = result + dst0 + dst1; \
|
||||
\
|
||||
VXC_Clamp(src2, src0, src1.s66666666, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \
|
||||
_viv_asm(COPY, dst0, src2, 8); \
|
||||
dst0 = sub_sat(dst0, 0xFE); \
|
||||
VXC_Clamp(src2, src0, src1.s77777777, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \
|
||||
_viv_asm(COPY, dst1, src2, 8); \
|
||||
dst1 = sub_sat(dst1, 0xFE); \
|
||||
\
|
||||
result = result + dst0 + dst1; \
|
||||
} \
|
||||
\
|
||||
for (; coord.z < boundaries_size; ) \
|
||||
{ \
|
||||
VXC_ReadImage(src1, boundaries, coord.zw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord.z ++; \
|
||||
\
|
||||
VXC_Clamp(src2, src0, src1.s00000000, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \
|
||||
_viv_asm(COPY, dst0, src2, 8); \
|
||||
dst0 = sub_sat(dst0, 0xFE); \
|
||||
\
|
||||
result = result + dst0; \
|
||||
} \
|
||||
\
|
||||
int4 d0, d1; \
|
||||
VXC_DP4x4(d0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniDataConvert_0_4x4); \
|
||||
VXC_DP4x4(d1, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniDataConvert_1_4x4); \
|
||||
coord.z = coord.x + 4; \
|
||||
\
|
||||
write_imagei(output, coord.xy, d0); \
|
||||
write_imagei(output, coord.zy, d1); \
|
||||
}
|
||||
BUCKETIZE_8BITS_SH_IMPL(U8_U8toI32_2D, vxc_uchar8)
|
||||
BUCKETIZE_8BITS_SH_IMPL(I8_I8toI32_2D, vxc_char8)
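The bucketize_right kernels above appear to compute, per input value, how many boundary values it is greater than or equal to: each VXC_Clamp in boolean-compare mode followed by sub_sat contributes one to the per-lane counter for every boundary the value passes, and the two VXC_DP4x4 calls at the end only repack the counters into int32 outputs. A scalar C model of that semantics, assuming a sorted boundaries array (the helper name and signature are illustrative, not from this commit):

#include <stddef.h>

/* Illustrative scalar model of bucketize_right: the output index for each
 * input value is the count of boundaries that are <= that value. */
static void bucketize_right_ref(int *out, const float *input, size_t n,
                                const float *boundaries, size_t boundaries_size)
{
    for (size_t i = 0; i < n; i++) {
        int idx = 0;
        for (size_t b = 0; b < boundaries_size; b++)
            idx += (boundaries[b] <= input[i]) ? 1 : 0;
        out[i] = idx;
    }
}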
@@ -98,7 +98,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name#
vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \
|
||||
half4 tmpVal0, tmpVal1; \
|
||||
float alpha = scale_vari; \
|
||||
float alpha = scale_vari * input_scale; \
|
||||
alpha = scale_vari * input_scale; \
|
||||
bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \
|
||||
bias_val = bias_val - input_zp * alpha; \
|
||||
\
|
||||
@@ -1,20 +1,14 @@
#include "cl_viv_vx_ext.h"
|
||||
|
||||
_viv_uniform int width;
|
||||
_viv_uniform int height;
|
||||
_viv_uniform float inv_multiplier;
|
||||
_viv_uniform int group_num;
|
||||
|
||||
_viv_uniform VXC_512Bits uniExtract8Data_2x8;
|
||||
_viv_uniform VXC_512Bits uniSum_X_X2_16x2;
|
||||
_viv_uniform float input_scale;
|
||||
_viv_uniform float input_scale2;
|
||||
_viv_uniform float input_zp;
|
||||
_viv_uniform float sum_x_tail;
|
||||
_viv_uniform float sum_x2_tail0;
|
||||
_viv_uniform float sum_x2_tail1;
|
||||
_viv_uniform float output_scale;
|
||||
_viv_uniform float output_zp;
|
||||
|
||||
_viv_uniform VXC_512Bits uniSumX_16x1;
|
||||
_viv_uniform VXC_512Bits uniSumX2_16x1;
|
||||
@@ -23,7 +17,7 @@ _viv_uniform VXC_512Bits uniSumX2_16x1;
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name( \
|
||||
__read_only image2d_array_t input, \
|
||||
__write_only image2d_array_t output, \
|
||||
float eps, int rs_flag) \
|
||||
float eps, int height) \
|
||||
{ \
|
||||
int gidx = get_global_id(0) << 4; \
|
||||
int lidx = get_local_id(0); \
|
||||
@@ -81,7 +75,7 @@ INSTANCE_NORM_SUMS_8BITS_IMPL(I8, vxc_char16)
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name##_2D( \
|
||||
__read_only image2d_array_t input, \
|
||||
__write_only image2d_array_t output, \
|
||||
float eps, int rs_flag) \
|
||||
float eps, int height) \
|
||||
{ \
|
||||
int gidx = get_global_id(0) << 4; \
|
||||
int lidx = get_local_id(0); \
|
||||
@@ -134,18 +128,62 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums
INSTANCE_NORM_SUMS_8BITS_IMPL_2D(U8, vxc_uchar16)
|
||||
INSTANCE_NORM_SUMS_8BITS_IMPL_2D(I8, vxc_char16)
|
||||
|
||||
__kernel void instance_norm_means
|
||||
(
|
||||
__read_only image2d_t sums,
|
||||
__read_only image2d_t bias,
|
||||
__read_only image2d_t scale,
|
||||
__write_only image2d_t means,
|
||||
float eps,
|
||||
float in_time_out_scale,
|
||||
float input_zp,
|
||||
float output_scale,
|
||||
float output_zp,
|
||||
float inv_multiplier,
|
||||
int group_num
|
||||
)
|
||||
{
|
||||
int2 coord = (int2)(get_global_id(0), get_global_id(1));
|
||||
|
||||
Image sums_img = create_image_from_image2d(sums, 4);
|
||||
float4 *sums_ptr = (float4 *)get_image_ptr_from_coord(sums_img, coord);
|
||||
|
||||
float alpha = read_imagef(scale, coord).x;
|
||||
float beta = read_imagef(bias, coord).x;
|
||||
|
||||
float4 mean_var = sums_ptr[0];
|
||||
for(int i = 1; i < group_num;)
|
||||
{
|
||||
mean_var += sums_ptr[i];
|
||||
i ++;
|
||||
}
|
||||
|
||||
mean_var *= inv_multiplier;
|
||||
mean_var.s1 = mean_var.s1 - mean_var.s0 * mean_var.s0 + eps;
|
||||
mean_var.s1 = rsqrt(mean_var.s1);
|
||||
|
||||
alpha = alpha * mean_var.y;
|
||||
|
||||
float4 dst;
|
||||
dst.x = in_time_out_scale * alpha;
|
||||
beta = (beta - alpha * mean_var.x) * output_scale + output_zp;
|
||||
dst.y = beta - input_zp * dst.x;
|
||||
|
||||
Image means_img = create_image_from_image2d(means, 4);
|
||||
float4 *means_ptr = (float4 *)get_image_ptr_from_coord(means_img, coord);
|
||||
means_ptr[0] = dst.xyxy;
|
||||
}
|
||||
|
||||
_viv_uniform VXC_512Bits uniDataToFP32_0_4x4;
|
||||
_viv_uniform VXC_512Bits uniDataToFP32_1_4x4;
|
||||
_viv_uniform VXC_512Bits uniDataToFP32_2_4x4;
|
||||
_viv_uniform VXC_512Bits uniDataToFP32_3_4x4;
|
||||
#define INSTANCE_NORM_8BITS_IMPL(name, src_type, dst_type) \
|
||||
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name( \
|
||||
__kernel void instance_norm_##name( \
|
||||
__read_only image2d_array_t input, \
|
||||
__read_only image2d_t bias, \
|
||||
__read_only image2d_t scale, \
|
||||
__read_only image2d_t meanVari, \
|
||||
__read_only image2d_t means, \
|
||||
__write_only image2d_array_t output, \
|
||||
float eps, int rs_flag) \
|
||||
int height) \
|
||||
{ \
|
||||
int gidz = get_global_id(1); \
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \
|
||||
@@ -153,26 +191,11 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na
int2 coord_para = (int2)(0, gidz); \
|
||||
src_type src0; \
|
||||
dst_type dst; \
|
||||
float scale_vari, bias_val; \
|
||||
float4 bias_f, scale_f, mean_vari = (float4)(0); \
|
||||
float4 coef; \
|
||||
\
|
||||
scale_f = read_imagef(scale, coord_para); \
|
||||
bias_f = read_imagef(bias, coord_para); \
|
||||
for(int i = 0; i < group_num; i++) \
|
||||
{ \
|
||||
mean_vari += read_imagef(meanVari, coord_para); \
|
||||
coord_para.x += 4; \
|
||||
} \
|
||||
mean_vari *= inv_multiplier; \
|
||||
mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \
|
||||
mean_vari.s1 = rsqrt(mean_vari.s1); \
|
||||
\
|
||||
scale_vari = scale_f.s0 * mean_vari.s1; \
|
||||
vxc_int4 tmpVal0, tmpVal1; \
|
||||
coef = read_imagef(means, coord_para); \
|
||||
int4 tmpVal0, tmpVal1; \
|
||||
float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \
|
||||
float alpha = input_scale * output_scale * scale_vari; \
|
||||
bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \
|
||||
bias_val = bias_val - input_zp * alpha; \
|
||||
\
|
||||
int8 input_desc, output_desc; \
|
||||
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
|
||||
@@ -191,14 +214,14 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na
VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \
|
||||
VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \
|
||||
VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \
|
||||
norm = tmpData0 * alpha + bias_val; \
|
||||
norm = tmpData0 * coef.x + coef.y; \
|
||||
tmpVal0 = convert_int4_rte(norm); \
|
||||
norm = tmpData1 * alpha + bias_val; \
|
||||
norm = tmpData1 * coef.x + coef.y; \
|
||||
tmpVal1 = convert_int4_rte(norm); \
|
||||
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
|
||||
norm = tmpData2 * alpha + bias_val; \
|
||||
norm = tmpData2 * coef.x + coef.y; \
|
||||
tmpVal0 = convert_int4_rte(norm); \
|
||||
norm = tmpData3 * alpha + bias_val; \
|
||||
norm = tmpData3 * coef.x + coef.y; \
|
||||
tmpVal1 = convert_int4_rte(norm); \
|
||||
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \
|
||||
@@ -208,60 +231,46 @@ INSTANCE_NORM_8BITS_IMPL(U8_F32toU8, vxc_uchar16, vxc_uchar16)
INSTANCE_NORM_8BITS_IMPL(I8_F32toI8, vxc_char16, vxc_char16)
|
||||
|
||||
#define INSTANCE_NORM_8BITS_IMPL_2D(name, src_type, dst_type) \
|
||||
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name##_2D( \
|
||||
__kernel void instance_norm_##name##_2D( \
|
||||
__read_only image2d_array_t input, \
|
||||
__read_only image2d_t bias, \
|
||||
__read_only image2d_t scale, \
|
||||
__read_only image2d_t meanVari, \
|
||||
__read_only image2d_t means, \
|
||||
__write_only image2d_array_t output, \
|
||||
float eps, int rs_flag) \
|
||||
int height) \
|
||||
{ \
|
||||
int gidz = get_global_id(1); \
|
||||
int gidy = gidz * height; \
|
||||
int2 coord = (int2)(get_global_id(0), gidy); \
|
||||
int4 coord; \
|
||||
int2 coord_para = (int2)(0, gidz); \
|
||||
int endH = gidy + height; \
|
||||
src_type src0; \
|
||||
dst_type dst; \
|
||||
float scale_vari, bias_val; \
|
||||
float4 bias_f, scale_f, mean_vari = (float4)(0); \
|
||||
float4 coef; \
|
||||
\
|
||||
scale_f = read_imagef(scale, coord_para); \
|
||||
bias_f = read_imagef(bias, coord_para); \
|
||||
for(int i = 0; i < group_num; i++) \
|
||||
{ \
|
||||
mean_vari += read_imagef(meanVari, coord_para); \
|
||||
coord_para.x += 4; \
|
||||
} \
|
||||
mean_vari *= inv_multiplier; \
|
||||
mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \
|
||||
mean_vari.s1 = rsqrt(mean_vari.s1); \
|
||||
\
|
||||
scale_vari = scale_f.s0 * mean_vari.s1; \
|
||||
vxc_int4 tmpVal0, tmpVal1; \
|
||||
coef = read_imagef(means, coord_para); \
|
||||
int4 tmpVal0, tmpVal1; \
|
||||
float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \
|
||||
float alpha = input_scale * output_scale * scale_vari; \
|
||||
bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \
|
||||
bias_val = bias_val - input_zp * alpha; \
|
||||
\
|
||||
for(; coord.y < endH; coord.y++) \
|
||||
coord = (int4)(get_global_id(0), gidy, gidy - 1, gidy - 1); \
|
||||
\
|
||||
for(; coord.y < endH; ) \
|
||||
{ \
|
||||
VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord.yz++; \
|
||||
VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \
|
||||
VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \
|
||||
VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \
|
||||
VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \
|
||||
norm = tmpData0 * alpha + bias_val; \
|
||||
norm = tmpData0 * coef.x + coef.y; \
|
||||
tmpVal0 = convert_int4_rte(norm); \
|
||||
norm = tmpData1 * alpha + bias_val; \
|
||||
norm = tmpData1 * coef.x + coef.y; \
|
||||
tmpVal1 = convert_int4_rte(norm); \
|
||||
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
|
||||
norm = tmpData2 * alpha + bias_val; \
|
||||
norm = tmpData2 * coef.x + coef.y; \
|
||||
tmpVal0 = convert_int4_rte(norm); \
|
||||
norm = tmpData3 * alpha + bias_val; \
|
||||
norm = tmpData3 * coef.x + coef.y; \
|
||||
tmpVal1 = convert_int4_rte(norm); \
|
||||
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
|
||||
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_WriteImage(output, coord.xz, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
|
||||
} \
|
||||
}
|
||||
INSTANCE_NORM_8BITS_IMPL_2D(U8_F32toU8, vxc_uchar16, vxc_uchar16)
|
||||
@@ -1,11 +1,5 @@
#include "cl_viv_vx_ext.h"
|
||||
|
||||
_viv_uniform int width;
|
||||
_viv_uniform int height;
|
||||
_viv_uniform float inv_multiplier;
|
||||
_viv_uniform int group_num;
|
||||
_viv_uniform float input_scale;
|
||||
_viv_uniform float input_zp;
|
||||
_viv_uniform VXC_512Bits uniExtract8Data_2x8;
|
||||
|
||||
_viv_uniform VXC_512Bits uniDataToFP32_0_4x4;
|
||||
|
@@ -14,13 +8,11 @@ _viv_uniform VXC_512Bits uniDataToFP32_2_4x4;
|
||||
|
||||
#define INSTANCE_NORM_8_TO_F16_IMPL(name, src_type) \
|
||||
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name( \
|
||||
__kernel void instance_norm_##name( \
|
||||
__read_only image2d_array_t input, \
|
||||
__read_only image2d_t bias, \
|
||||
__read_only image2d_t scale, \
|
||||
__read_only image2d_t meanVari, \
|
||||
__read_only image2d_t means, \
|
||||
__write_only image2d_array_t output, \
|
||||
float eps, int rs_flag) \
|
||||
int height) \
|
||||
{ \
|
||||
int gidz = get_global_id(1); \
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \
|
||||
|
@@ -28,25 +20,11 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na
|
||||
vxc_short8 outval; \
|
||||
vxc_half8 dst; \
|
||||
float scale_vari, bias_val; \
|
||||
float4 bias_f, scale_f, mean_vari = (float4)(0); \
|
||||
float4 coef; \
|
||||
\
|
||||
scale_f = read_imagef(scale, coord_para.xy); \
|
||||
bias_f = read_imagef(bias, coord_para.xy); \
|
||||
for(int i = 0; i < group_num; i++) \
|
||||
{ \
|
||||
mean_vari += read_imagef(meanVari, coord_para.xy); \
|
||||
coord_para.x += 4; \
|
||||
} \
|
||||
mean_vari *= inv_multiplier; \
|
||||
mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \
|
||||
mean_vari.s1 = rsqrt(mean_vari.s1); \
|
||||
scale_vari = scale_f.s0 * mean_vari.s1; \
|
||||
coef = read_imagef(means, coord_para.xy); \
|
||||
float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \
|
||||
half4 tmpVal0, tmpVal1; \
|
||||
float alpha = scale_vari * input_scale; \
|
||||
bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \
|
||||
bias_val = bias_val - input_zp * alpha; \
|
||||
\
|
||||
coord_para = coord; \
|
||||
int8 input_desc, output_desc; \
|
||||
|
@@ -67,17 +45,17 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na
|
||||
VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \
|
||||
VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \
|
||||
norm = alpha * tmpData0 + bias_val; \
|
||||
norm = tmpData0 * coef.x + coef.y; \
|
||||
_viv_asm(CONV, tmpVal0, norm); \
|
||||
norm = alpha * tmpData1 + bias_val; \
|
||||
norm = tmpData1 * coef.x + coef.y; \
|
||||
_viv_asm(CONV, tmpVal1, norm); \
|
||||
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \
|
||||
_viv_asm(COPY, outval, dst, 16); \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
|
||||
coord_para.x += 8; \
|
||||
norm = alpha * tmpData2 + bias_val; \
|
||||
norm = tmpData2 * coef.x + coef.y; \
|
||||
_viv_asm(CONV, tmpVal0, norm); \
|
||||
norm = alpha * tmpData3 + bias_val; \
|
||||
norm = tmpData3 * coef.x + coef.y; \
|
||||
_viv_asm(CONV, tmpVal1, norm); \
|
||||
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \
|
||||
_viv_asm(COPY, outval, dst, 16); \
|
||||
|
@@ -88,13 +66,11 @@ INSTANCE_NORM_8_TO_F16_IMPL(U8_F32toF16, vxc_uchar16)
|
||||
|
||||
#define INSTANCE_NORM_8_TO_F16_IMPL_2D(name, src_type) \
|
||||
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name##_2D( \
|
||||
__kernel void instance_norm_##name##_2D( \
|
||||
__read_only image2d_array_t input, \
|
||||
__read_only image2d_t bias, \
|
||||
__read_only image2d_t scale, \
|
||||
__read_only image2d_t meanVari, \
|
||||
__read_only image2d_t means, \
|
||||
__write_only image2d_array_t output, \
|
||||
float eps, int rs_flag) \
|
||||
int height) \
|
||||
{ \
|
||||
int gidz = get_global_id(1); \
|
||||
int gidy = gidz * height; \
|
||||
|
@@ -104,26 +80,11 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na
|
||||
vxc_short8 outval; \
|
||||
vxc_half8 dst; \
|
||||
float scale_vari, bias_val; \
|
||||
float4 bias_f, scale_f, mean_vari = (float4)(0); \
|
||||
float4 coef; \
|
||||
\
|
||||
scale_f = read_imagef(scale, coord_para.xy); \
|
||||
bias_f = read_imagef(bias, coord_para.xy); \
|
||||
for(int i = 0; i < group_num; i++) \
|
||||
{ \
|
||||
mean_vari += read_imagef(meanVari, coord_para.xy); \
|
||||
coord_para.x += 4; \
|
||||
} \
|
||||
mean_vari *= inv_multiplier; \
|
||||
mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \
|
||||
mean_vari.s1 = rsqrt(mean_vari.s1); \
|
||||
\
|
||||
scale_vari = scale_f.s0 * mean_vari.s1; \
|
||||
coef = read_imagef(means, coord_para.xy); \
|
||||
float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \
|
||||
half4 tmpVal0, tmpVal1; \
|
||||
float alpha = scale_vari * input_scale; \
|
||||
bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \
|
||||
bias_val = bias_val - input_zp * alpha; \
|
||||
for(; coord.y < endH;) \
|
||||
{ \
|
||||
VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
|
||||
|
@@ -133,17 +94,17 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na
|
||||
VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \
|
||||
VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \
|
||||
norm = alpha * tmpData0 + bias_val; \
|
||||
norm = tmpData0 * coef.x + coef.y; \
|
||||
_viv_asm(CONV, tmpVal0, norm); \
|
||||
norm = alpha * tmpData1 + bias_val; \
|
||||
norm = tmpData1 * coef.x + coef.y; \
|
||||
_viv_asm(CONV, tmpVal1, norm); \
|
||||
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \
|
||||
_viv_asm(COPY, outval, dst, 16); \
|
||||
VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_para.x += 8; \
|
||||
norm = alpha * tmpData2 + bias_val; \
|
||||
norm = tmpData2 * coef.x + coef.y; \
|
||||
_viv_asm(CONV, tmpVal0, norm); \
|
||||
norm = alpha * tmpData3 + bias_val; \
|
||||
norm = tmpData3 * coef.x + coef.y; \
|
||||
_viv_asm(CONV, tmpVal1, norm); \
|
||||
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \
|
||||
_viv_asm(COPY, outval, dst, 16); \
|
||||
|
@@ -1,28 +1,21 @@
|
||||
|
||||
_viv_uniform int width;
|
||||
_viv_uniform int height;
|
||||
_viv_uniform float inv_multiplier;
|
||||
_viv_uniform int group_num;
|
||||
_viv_uniform VXC_512Bits uniDataToFP32_0_4x4;
|
||||
_viv_uniform VXC_512Bits uniDataToFP32_1_4x4;
|
||||
_viv_uniform VXC_512Bits uniExtract8Data_2x8;
|
||||
_viv_uniform VXC_512Bits uniSum_X_X2_8x2;
|
||||
_viv_uniform float input_scale;
|
||||
_viv_uniform float input_scale2;
|
||||
_viv_uniform float input_zp;
|
||||
_viv_uniform float sum_x_tail;
|
||||
_viv_uniform float sum_x2_tail0;
|
||||
_viv_uniform float sum_x2_tail1;
|
||||
|
||||
_viv_uniform float output_scale;
|
||||
_viv_uniform float output_zp;
|
||||
|
||||
#define INSTANCE_NORM_SUMS_16BITS_IMPL(name, src_type) \
|
||||
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name( \
|
||||
__read_only image2d_array_t input, \
|
||||
__write_only image2d_array_t output, \
|
||||
float eps, int rs_flag) \
|
||||
float eps, int height) \
|
||||
{ \
|
||||
int gidx = get_global_id(0) << 3; \
|
||||
int lidx = get_local_id(0); \
|
||||
|
@@ -87,7 +80,7 @@ INSTANCE_NORM_SUMS_16BITS_IMPL(I16, vxc_short8)
|
||||
__read_only image2d_array_t input, \
|
||||
__write_only image2d_array_t output, \
|
||||
float eps, int rs_flag) \
|
||||
float eps, int height) \
|
||||
{ \
|
||||
int gidx = get_global_id(0) << 3; \
|
||||
int lidx = get_local_id(0); \
|
||||
|
@@ -146,13 +139,11 @@ INSTANCE_NORM_SUMS_16BITS_IMPL_2D(F16, vxc_half8)
|
||||
|
||||
#define INSTANCE_NORM_16BITS_IMPL(name, src_type, dst_type, copy_type, conv_type) \
|
||||
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name( \
|
||||
__kernel void instance_norm_##name( \
|
||||
__read_only image2d_array_t input, \
|
||||
__read_only image2d_t bias, \
|
||||
__read_only image2d_t scale, \
|
||||
__read_only image2d_t meanVari, \
|
||||
__read_only image2d_t means, \
|
||||
__write_only image2d_array_t output, \
|
||||
float eps, int rs_flag) \
|
||||
int height) \
|
||||
{ \
|
||||
int gidz = get_global_id(1); \
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \
|
||||
@@ -160,28 +151,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na
int4 coord_para = (int4)(0, gidz, 0, 0); \
|
||||
vxc_short8 src0; \
|
||||
src_type in_h; \
|
||||
float scale_vari, bias_val; \
|
||||
float4 bias_f, scale_f, mean_vari = (float4)(0); \
|
||||
float4 coef; \
|
||||
\
|
||||
scale_f = read_imagef(scale, coord_para.xy); \
|
||||
bias_f = read_imagef(bias, coord_para.xy); \
|
||||
coef = read_imagef(means, coord_para.xy); \
|
||||
\
|
||||
for(int i = 0; i < group_num; i++) \
|
||||
{ \
|
||||
mean_vari += read_imagef(meanVari, coord_para.xy); \
|
||||
coord_para.x += 4; \
|
||||
} \
|
||||
mean_vari *= inv_multiplier; \
|
||||
mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \
|
||||
mean_vari.s1 = rsqrt(mean_vari.s1); \
|
||||
\
|
||||
scale_vari = scale_f.s0 * mean_vari.s1; \
|
||||
float alpha = input_scale * output_scale * scale_vari; \
|
||||
float4 tmpData0, tmpData1; \
|
||||
copy_type outval; \
|
||||
conv_type tmpVal0, tmpVal1; \
|
||||
bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \
|
||||
bias_val = bias_val - input_zp * alpha; \
|
||||
dst_type dst; \
|
||||
\
|
||||
int8 input_desc, output_desc; \
|
||||
@@ -204,9 +180,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na
VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \
|
||||
\
|
||||
float4 norm; \
|
||||
norm = alpha * tmpData0 + bias_val; \
|
||||
norm = tmpData0 * coef.x + coef.y; \
|
||||
_viv_asm(CONV_RTE, tmpVal0, norm); \
|
||||
norm = alpha * tmpData1 + bias_val; \
|
||||
norm = tmpData1 * coef.x + coef.y; \
|
||||
_viv_asm(CONV_RTE, tmpVal1, norm); \
|
||||
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \
|
||||
_viv_asm(COPY, outval, dst, 16); \
|
||||
|
@@ -221,13 +197,11 @@ INSTANCE_NORM_16BITS_IMPL(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4
|
||||
|
||||
#define INSTANCE_NORM_16BITS_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \
|
||||
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name##_2D( \
|
||||
__kernel void instance_norm_##name##_2D( \
|
||||
__read_only image2d_array_t input, \
|
||||
__read_only image2d_t bias, \
|
||||
__read_only image2d_t scale, \
|
||||
__read_only image2d_t meanVari, \
|
||||
__read_only image2d_t means, \
|
||||
__write_only image2d_array_t output, \
|
||||
float eps, int rs_flag) \
|
||||
int height) \
|
||||
{ \
|
||||
int gidz = get_global_id(1); \
|
||||
int gidy = gidz * height; \
|
||||
@@ -236,28 +210,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na
int endH = gidy + height; \
|
||||
vxc_short8 src0; \
|
||||
src_type in_h; \
|
||||
float scale_vari, bias_val; \
|
||||
float4 bias_f, scale_f, mean_vari = (float4)(0); \
|
||||
float4 coef; \
|
||||
\
|
||||
scale_f = read_imagef(scale, coord_para.xy); \
|
||||
bias_f = read_imagef(bias, coord_para.xy); \
|
||||
coef = read_imagef(means, coord_para.xy); \
|
||||
\
|
||||
for(int i = 0; i < group_num; i++) \
|
||||
{ \
|
||||
mean_vari += read_imagef(meanVari, coord_para.xy); \
|
||||
coord_para.x += 4; \
|
||||
} \
|
||||
mean_vari *= inv_multiplier; \
|
||||
mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \
|
||||
mean_vari.s1 = rsqrt(mean_vari.s1); \
|
||||
\
|
||||
scale_vari = scale_f.s0 * mean_vari.s1; \
|
||||
float alpha = input_scale * output_scale * scale_vari; \
|
||||
float4 tmpData0, tmpData1; \
|
||||
copy_type outval; \
|
||||
conv_type tmpVal0, tmpVal1; \
|
||||
bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \
|
||||
bias_val = bias_val - input_zp * alpha; \
|
||||
dst_type dst; \
|
||||
\
|
||||
for(; coord.y < endH; coord.y++) \
|
||||
@@ -268,9 +227,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na
VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \
|
||||
VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \
|
||||
float4 norm; \
|
||||
norm = alpha * tmpData0 + bias_val; \
|
||||
norm = tmpData0 * coef.x + coef.y; \
|
||||
_viv_asm(CONV, tmpVal0, norm); \
|
||||
norm = alpha * tmpData1 + bias_val; \
|
||||
norm = tmpData1 * coef.x + coef.y; \
|
||||
_viv_asm(CONV, tmpVal1, norm); \
|
||||
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \
|
||||
_viv_asm(COPY, outval, dst, 16); \
|
||||
|
@@ -1,15 +1,13 @@
|
||||
|
||||
_viv_uniform int width;
|
||||
_viv_uniform int height;
|
||||
_viv_uniform float inv_multiplier;
|
||||
_viv_uniform int group_num;
|
||||
|
||||
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
|
||||
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
|
||||
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
|
||||
|
||||
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_BF16(
|
||||
image2d_array_t input, image2d_array_t output, float eps, int rsFlg)
|
||||
image2d_array_t input, image2d_array_t output, float eps, int height)
|
||||
{
|
||||
int gidx = get_global_id(0) << 3;
|
||||
int lidx = get_local_id(0);
|
||||
|
@@ -70,7 +68,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums
|
||||
|
||||
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_BF16_2D(
|
||||
image2d_array_t input, image2d_array_t output, float eps, int rsFlg)
|
||||
image2d_array_t input, image2d_array_t output, float eps, int height)
|
||||
{
|
||||
int gidx = get_global_id(0) << 3;
|
||||
int lidx = get_local_id(0);
|
||||
|
@@ -129,36 +127,21 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums
|
||||
}
|
||||
|
||||
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16_F32toBF16(
|
||||
image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,
|
||||
image2d_array_t output, float eps, int rsFlg)
|
||||
__kernel void instance_norm_BF16_F32toBF16(
|
||||
__read_only image2d_array_t input,
|
||||
__read_only image2d_t means,
|
||||
__write_only image2d_array_t output,
|
||||
int height)
|
||||
{
|
||||
int gidz = get_global_id(1);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
vxc_short8 src0, src1, src2;
|
||||
float scale_vari, bias_val;
|
||||
float4 mean_vari = (float4)(0);
|
||||
float4 coef;
|
||||
|
||||
Image img3 = create_image_from_image2d(meanVari, 4);
|
||||
__global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, (int2)(0, gidz));
|
||||
__global float4* vari_ptr = (__global float4*)sumVari_ptr;
|
||||
coef = read_imagef(means, coord.yz);
|
||||
|
||||
float sval = read_imagef(scale, coord.yz).x;
|
||||
float bval = read_imagef(bias, coord.yz).x;
|
||||
|
||||
for(int i = 0; i < group_num; i++)
|
||||
{
|
||||
mean_vari += vari_ptr[i];
|
||||
}
|
||||
|
||||
mean_vari *= inv_multiplier;
|
||||
mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;
|
||||
mean_vari.s1 = rsqrt(mean_vari.s1);
|
||||
|
||||
scale_vari = sval * mean_vari.s1;
|
||||
float4 tmpData0, tmpData1;
|
||||
bias_val = (bval - scale_vari * mean_vari.s0);
|
||||
|
||||
int8 input_desc, output_desc;
|
||||
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
@ -171,6 +154,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16
|
|||
|
||||
for(coord.y = 0; coord.y < height; coord.y++)
|
||||
{
|
||||
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
|
||||
VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
coord_in.y ++;
@ -182,9 +166,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16
|
|||
_viv_asm(COPY, tmpData1, src2, 16);
|
||||
|
||||
float4 norm;
|
||||
norm = scale_vari * tmpData0 + bias_val;
|
||||
norm = tmpData0 * coef.x + coef.y;
|
||||
_viv_asm(COPY, src0, norm, 16);
|
||||
norm = scale_vari * tmpData1 + bias_val;
|
||||
norm = tmpData1 * coef.x + coef.y;
|
||||
_viv_asm(COPY, src1, norm, 16);
|
||||
VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord, src2, \
@ -192,41 +176,27 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16
|
|||
}
|
||||
}
|
||||
|
||||
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16F32toBF16_2D(
|
||||
image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,
|
||||
image2d_array_t output, float eps, int rsFlg)
|
||||
__kernel void instance_norm_BF16_F32toBF16_2D(
|
||||
__read_only image2d_array_t input,
|
||||
__read_only image2d_t means,
|
||||
__write_only image2d_array_t output,
|
||||
int height)
|
||||
{
|
||||
int gidz = get_global_id(1);
|
||||
int gidy = gidz * height;
|
||||
int2 coord = (int2)(get_global_id(0), gidy);
|
||||
int2 coord_para = (int2)(gidz, 0);
|
||||
int2 coord_para = (int2)(0, gidz);
|
||||
int endH = gidy + height;
|
||||
vxc_short8 src0, src1, src2;
|
||||
float scale_vari, bias_val;
|
||||
float4 mean_vari = (float4)(0);
|
||||
float4 coef;
|
||||
|
||||
Image img3 = create_image_from_image2d(meanVari, 4);
|
||||
__global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx);
|
||||
__global float4* vari_ptr = (__global float4*)sumVari_ptr;
|
||||
coef = read_imagef(means, coord_para);
|
||||
|
||||
float sval = read_imagef(scale, coord_para.yx).x;
|
||||
float bval = read_imagef(bias, coord_para.yx).x;
|
||||
|
||||
for(int i = 0; i < group_num; i++)
|
||||
{
|
||||
mean_vari += vari_ptr[i];
|
||||
}
|
||||
|
||||
mean_vari *= inv_multiplier;
|
||||
mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;
|
||||
mean_vari.s1 = rsqrt(mean_vari.s1);
|
||||
|
||||
scale_vari = sval * mean_vari.s1;
|
||||
float4 tmpData0, tmpData1;
|
||||
bias_val = (bval - scale_vari * mean_vari.s0);
|
||||
|
||||
for(; coord.y < endH; coord.y++)
|
||||
{
|
||||
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
|
||||
VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
@ -237,9 +207,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16
|
|||
_viv_asm(COPY, tmpData1, src2, 16);
|
||||
|
||||
float4 norm;
|
||||
norm = scale_vari * tmpData0 + bias_val;
|
||||
norm = tmpData0 * coef.x + coef.y;
|
||||
_viv_asm(COPY, src0, norm, 16);
|
||||
norm = scale_vari * tmpData1 + bias_val;
|
||||
norm = tmpData1 * coef.x + coef.y;
|
||||
_viv_asm(COPY, src1, norm, 16);
|
||||
VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
|
||||
VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
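Note (not part of the diff): the BF16 instance-norm kernels keep the same two-pass shape, with per-group partial sums of x and x*x accumulated into meanVari and then reduced before normalizing. A scalar sketch of that reduction, assuming inv_multiplier is 1/N for the number of normalized elements and that meanVari packs (sum, sum of squares) per group; names mirror the kernel:

/* Hedged reference for the meanVari reduction used above (plain C). */
#include <math.h>

static void reduce_mean_inv_std(const float *partial_sum,    /* sum(x) per group   */
                                const float *partial_sqsum,  /* sum(x*x) per group */
                                int group_num, float inv_multiplier, float eps,
                                float *mean_out, float *inv_std_out)
{
    float s = 0.0f, sq = 0.0f;
    for (int i = 0; i < group_num; i++) {   /* mirrors: mean_vari += vari_ptr[i]; */
        s  += partial_sum[i];
        sq += partial_sqsum[i];
    }
    float mean = s  * inv_multiplier;       /* mean_vari.s0 */
    float ex2  = sq * inv_multiplier;       /* mean_vari.s1 */
    float var  = ex2 - mean * mean + eps;   /* E[x^2] - E[x]^2 + eps */
    *mean_out    = mean;
    *inv_std_out = 1.0f / sqrtf(var);       /* rsqrt(mean_vari.s1) */
}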
@ -150,7 +150,7 @@ _viv_uniform int inputZP;
VXC_Vstore3(dst_ptr, 0, dst.s012); \
break; \
case 4: \
VXC_Vstore4(dst_ptr, 0, dst.0123); \
VXC_Vstore4(dst_ptr, 0, dst.s0123); \
break; \
case 5: \
VXC_Vstore2(dst_ptr, 0, dst.s01); \
@ -165,7 +165,7 @@ _viv_uniform int inputZP;
VXC_Vstore3(dst_ptr, 0, dst.s012); \
break; \
case 7: \
VXC_Vstore4(dst_ptr, 0, dst.0123); \
VXC_Vstore4(dst_ptr, 0, dst.s0123); \
dst.s012 = dst.s456; \
dst_ptr += 4; \
VXC_Vstore3(dst_ptr, 0, dst.s012); \
@ -10,6 +10,11 @@ _viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
|
|||
_viv_uniform int ac2zero;
|
||||
_viv_uniform int bc2zero;
|
||||
|
||||
_viv_uniform VXC_512Bits uniI16MulI16SumtoI32_16x1;
|
||||
_viv_uniform VXC_512Bits uniI16MulI16SumtoI32B_16x1;
|
||||
_viv_uniform float inout_beta;
|
||||
_viv_uniform float inout_scale;
|
||||
|
||||
#define GEMM_QINT_TO_QINT(src0_type_name, read_type) \
|
||||
__kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name( \
|
||||
image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \
@ -102,3 +107,139 @@ __kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name( \
|
|||
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
|
||||
}
|
||||
GEMM_QINT_TO_QINT(I16, vxc_short8)
|
||||
|
||||
__kernel void gemm_transb_I16I16toI16(image2d_array_t inputA,
|
||||
image2d_array_t inputB, image2d_array_t output,
|
||||
int transposeA, int transposeB, int adjointA, int adjointB,
|
||||
uint M, uint K, uint N)
|
||||
{
|
||||
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
|
||||
int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0);
|
||||
int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 0 : get_global_id(2)), 0);
|
||||
|
||||
vxc_float4 sum0 = (vxc_float4)(0);
|
||||
vxc_float4 sum1 = (vxc_float4)(0);
|
||||
vxc_float4 sum2 = (vxc_float4)(0);
|
||||
vxc_float4 sum3 = (vxc_float4)(0);
|
||||
|
||||
int8 inputA_desc, inputB_desc, output_desc;
|
||||
_viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));
|
||||
int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;
|
||||
_viv_asm(MOV, coord_a.w, baseAddr_a);
|
||||
_viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));
|
||||
int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;
|
||||
_viv_asm(MOV, coord_b.w, baseAddr_b);
|
||||
|
||||
for(coord_a.x = 0, coord_b.x = 0; coord_a.x < K;)
|
||||
{
|
||||
vxc_short8 srcA0,srcA1,srcA2,srcA3;
|
||||
vxc_short8 srcB0,srcB1,srcB2,srcB3;
|
||||
VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1),
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2),
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3),
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1),
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2),
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3),
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
coord_a.x += 8;
|
||||
coord_b.x += 8;
|
||||
|
||||
vxc_int4 iVal;
|
||||
vxc_float4 fpVal;
|
||||
VXC_DP16x1(iVal, srcA0, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),
|
||||
uniI16MulI16SumtoI32_16x1);
|
||||
VXC_DP16x1(fpVal, iVal, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),
|
||||
uniI16MulI16SumtoI32B_16x1);
|
||||
VXC_DP16x1(iVal, srcA0, srcB1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),
|
||||
uniI16MulI16SumtoI32_16x1);
|
||||
VXC_DP16x1(fpVal, iVal, srcB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0),
|
||||
uniI16MulI16SumtoI32B_16x1);
|
||||
VXC_DP16x1(iVal, srcA0, srcB2, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),
|
||||
uniI16MulI16SumtoI32_16x1);
|
||||
VXC_DP16x1(fpVal, iVal, srcB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0),
|
||||
uniI16MulI16SumtoI32B_16x1);
|
||||
VXC_DP16x1(iVal, srcA0, srcB3, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),
|
||||
uniI16MulI16SumtoI32_16x1);
|
||||
VXC_DP16x1(fpVal, iVal, srcB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0),
|
||||
uniI16MulI16SumtoI32B_16x1);
|
||||
sum0 = sum0 + fpVal * inout_scale + inout_beta;
|
||||
|
||||
VXC_DP16x1(iVal, srcA1, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),
|
||||
uniI16MulI16SumtoI32_16x1);
|
||||
VXC_DP16x1(fpVal, iVal, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),
|
||||
uniI16MulI16SumtoI32B_16x1);
|
||||
VXC_DP16x1(iVal, srcA1, srcB1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),
|
||||
uniI16MulI16SumtoI32_16x1);
|
||||
VXC_DP16x1(fpVal, iVal, srcB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0),
|
||||
uniI16MulI16SumtoI32B_16x1);
|
||||
VXC_DP16x1(iVal, srcA1, srcB2, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),
|
||||
uniI16MulI16SumtoI32_16x1);
|
||||
VXC_DP16x1(fpVal, iVal, srcB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0),
|
||||
uniI16MulI16SumtoI32B_16x1);
|
||||
VXC_DP16x1(iVal, srcA1, srcB3, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),
|
||||
uniI16MulI16SumtoI32_16x1);
|
||||
VXC_DP16x1(fpVal, iVal, srcB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0),
|
||||
uniI16MulI16SumtoI32B_16x1);
|
||||
sum1 = sum1 + fpVal * inout_scale + inout_beta;
|
||||
|
||||
VXC_DP16x1(iVal, srcA2, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),
|
||||
uniI16MulI16SumtoI32_16x1);
|
||||
VXC_DP16x1(fpVal, iVal, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),
|
||||
uniI16MulI16SumtoI32B_16x1);
|
||||
VXC_DP16x1(iVal, srcA2, srcB1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),
|
||||
uniI16MulI16SumtoI32_16x1);
|
||||
VXC_DP16x1(fpVal, iVal, srcB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0),
|
||||
uniI16MulI16SumtoI32B_16x1);
|
||||
VXC_DP16x1(iVal, srcA2, srcB2, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),
|
||||
uniI16MulI16SumtoI32_16x1);
|
||||
VXC_DP16x1(fpVal, iVal, srcB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0),
|
||||
uniI16MulI16SumtoI32B_16x1);
|
||||
VXC_DP16x1(iVal, srcA2, srcB3, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),
|
||||
uniI16MulI16SumtoI32_16x1);
|
||||
VXC_DP16x1(fpVal, iVal, srcB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0),
|
||||
uniI16MulI16SumtoI32B_16x1);
|
||||
sum2 = sum2 + fpVal * inout_scale + inout_beta;
|
||||
|
||||
VXC_DP16x1(iVal, srcA3, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),
|
||||
uniI16MulI16SumtoI32_16x1);
|
||||
VXC_DP16x1(fpVal, iVal, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),
|
||||
uniI16MulI16SumtoI32B_16x1);
|
||||
VXC_DP16x1(iVal, srcA3, srcB1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),
|
||||
uniI16MulI16SumtoI32_16x1);
|
||||
VXC_DP16x1(fpVal, iVal, srcB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0),
|
||||
uniI16MulI16SumtoI32B_16x1);
|
||||
VXC_DP16x1(iVal, srcA3, srcB2, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),
|
||||
uniI16MulI16SumtoI32_16x1);
|
||||
VXC_DP16x1(fpVal, iVal, srcB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0),
|
||||
uniI16MulI16SumtoI32B_16x1);
|
||||
VXC_DP16x1(iVal, srcA3, srcB3, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),
|
||||
uniI16MulI16SumtoI32_16x1);
|
||||
VXC_DP16x1(fpVal, iVal, srcB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0),
|
||||
uniI16MulI16SumtoI32B_16x1);
|
||||
sum3 = sum3 + fpVal * inout_scale + inout_beta;
|
||||
}
|
||||
vxc_int4 tmpOut0, tmpOut1;
|
||||
vxc_short8 valDst;
|
||||
tmpOut0 = convert_int4_rte(sum0);
|
||||
tmpOut1 = convert_int4_rte(sum1);
|
||||
VXC_DP2x8(valDst, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);
|
||||
VXC_WriteImage2DArray(output, coord_out, valDst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
coord_out.y++;
|
||||
VXC_WriteImage2DArray(output, coord_out, valDst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
coord_out.y++;
|
||||
tmpOut0 = convert_int4_rte(sum2);
|
||||
tmpOut1 = convert_int4_rte(sum3);
|
||||
VXC_DP2x8(valDst, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);
|
||||
VXC_WriteImage2DArray(output, coord_out, valDst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
coord_out.y++;
|
||||
VXC_WriteImage2DArray(output, coord_out, valDst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
}
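Note (not part of the diff): the new gemm_transb_I16I16toI16 kernel computes C = A * B^T on int16 data, accumulating 8-wide dot products along K and applying inout_scale and inout_beta once per 8-element chunk before converting back to int16. A scalar sketch under those assumptions (plain C, hypothetical helper, not the library API):

/* Hedged scalar reference for gemm_transb (C = A * B^T), int16 in/out. */
#include <stdint.h>
#include <math.h>

static void gemm_transb_i16_ref(const int16_t *A, const int16_t *B, int16_t *C,
                                int M, int K, int N,
                                float inout_scale, float inout_beta)
{
    for (int y = 0; y < M; y++) {
        for (int x = 0; x < N; x++) {
            float acc = 0.0f;
            /* the kernel consumes K in chunks of 8 and applies the
             * scale/offset once per chunk; that is mirrored here */
            for (int k0 = 0; k0 < K; k0 += 8) {
                int32_t dot = 0;
                for (int k = k0; k < k0 + 8 && k < K; k++)
                    dot += (int32_t)A[y * K + k] * (int32_t)B[x * K + k];
                acc += (float)dot * inout_scale + inout_beta;
            }
            long v = lrintf(acc);           /* round to nearest */
            if (v >  32767) v =  32767;     /* saturate to int16 */
            if (v < -32768) v = -32768;
            C[y * N + x] = (int16_t)v;
        }
    }
}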
@ -0,0 +1,86 @@
|
|||
#include "cl_viv_vx_ext.h"
|
||||
|
||||
_viv_uniform int bOrder;
|
||||
_viv_uniform int rOrder;
|
||||
|
||||
_viv_uniform float outputScaleVar;
|
||||
_viv_uniform float bMeanScaleVarZp;
|
||||
_viv_uniform float gMeanScaleVarZp;
|
||||
_viv_uniform float rMeanScaleVarZp;
|
||||
|
||||
_viv_uniform VXC_512Bits uniConvertNV12toB_4x4;
|
||||
_viv_uniform VXC_512Bits uniConvertNV12toG_4x4;
|
||||
_viv_uniform VXC_512Bits uniConvertNV12toR_4x4;
|
||||
|
||||
_viv_uniform VXC_512Bits uniExtract8Data_2x8;
|
||||
_viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8;
|
||||
|
||||
#define NV12_COPY_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \
|
||||
__kernel void pre_process_nv12_copy_##name \
|
||||
( \
|
||||
__read_only image2d_array_t y_img, \
|
||||
__read_only image2d_array_t uv_img, \
|
||||
__write_only image2d_array_t output, \
|
||||
global int* xRatio, \
|
||||
global int* yRatio, \
|
||||
global int* xOffset, \
|
||||
global int* yOffset, \
|
||||
float rMean, \
|
||||
float gMean, \
|
||||
float bMean, \
|
||||
float var, \
|
||||
int reverse_channel, \
|
||||
int trans \
|
||||
) \
|
||||
{ \
|
||||
int gidx = get_global_id(0); \
|
||||
int gidy = get_global_id(1); \
|
||||
\
|
||||
int sy = gidy + (*yOffset); \
|
||||
int sx = gidx + (*xOffset); \
|
||||
int uvX = sx & 0xfffffffe; \
|
||||
int uvY = sy >> 1; \
|
||||
\
|
||||
vxc_uchar16 Y, UV; \
|
||||
\
|
||||
VXC_ReadImage(Y, y_img, (int2)(sx,sy), 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(UV, uv_img,(int2)(uvX,uvY), 0,VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
vxc_char16 tmpUV; \
|
||||
short tmpVal = 128; \
|
||||
VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8); \
|
||||
\
|
||||
float4 tmpDstB, tmpDstG, tmpDstR; \
|
||||
VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); \
|
||||
VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); \
|
||||
VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); \
|
||||
\
|
||||
conv_type result; \
|
||||
dst_type dst0; \
|
||||
save_type dst; \
|
||||
int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \
|
||||
tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \
|
||||
_viv_asm(CONV_RTE, result, tmpDstB); \
|
||||
dstPos.z = bOrder; \
|
||||
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \
|
||||
_viv_asm(COPY, dst, dst0, copy_bytes); \
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \
|
||||
_viv_asm(CONV_RTE, result, tmpDstG); \
|
||||
dstPos.z = 1; \
|
||||
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \
|
||||
_viv_asm(COPY, dst, dst0, copy_bytes); \
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \
|
||||
_viv_asm(CONV_RTE, result, tmpDstR); \
|
||||
dstPos.z = rOrder; \
|
||||
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \
|
||||
_viv_asm(COPY, dst, dst0, copy_bytes); \
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
}
|
||||
NV12_COPY_SH_IMPL(U8toU8, vxc_uchar8, int4, vxc_uchar8, 8)
|
||||
NV12_COPY_SH_IMPL(U8toI8, vxc_char8, int4, vxc_char8, 8)
|
||||
NV12_COPY_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16)
|
||||
NV12_COPY_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16)
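Note (not part of the diff): the NV12 copy kernels above convert the Y plane plus interleaved UV to RGB and then apply a per-channel affine normalization, out = rgb * outputScaleVar + {b,g,r}MeanScaleVarZp, before requantizing. A per-pixel sketch, assuming the BT.601-style integer conversion that the yuv420 kernels later in this change spell out in comments; how mean, var and the output zero point are folded into the two per-channel constants is an assumption:

/* Hedged per-pixel reference for the NV12 copy path (plain C). */
#include <stdint.h>

static inline int clamp_u8(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

static void nv12_pixel_to_rgb_norm(uint8_t y, uint8_t u, uint8_t v,
                                   float outputScaleVar,
                                   float bMeanScaleVarZp, float gMeanScaleVarZp,
                                   float rMeanScaleVarZp,
                                   float out_bgr[3])
{
    /* integer YUV -> RGB, same coefficient family as the yuv420 kernel comments */
    int c = (int)y - 16, d = (int)u - 128, e = (int)v - 128;
    int r = clamp_u8((298 * c + 409 * e + 128) >> 8);
    int g = clamp_u8((298 * c - 100 * d - 208 * e + 128) >> 8);
    int b = clamp_u8((298 * c + 516 * d + 128) >> 8);

    /* per-channel affine normalization, exactly as written in the kernel */
    out_bgr[0] = b * outputScaleVar + bMeanScaleVarZp;
    out_bgr[1] = g * outputScaleVar + gMeanScaleVarZp;
    out_bgr[2] = r * outputScaleVar + rMeanScaleVarZp;
}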
@ -8,151 +8,195 @@ _viv_uniform float bMeanScaleVarZp;
|
|||
_viv_uniform float gMeanScaleVarZp;
|
||||
_viv_uniform float rMeanScaleVarZp;
|
||||
|
||||
_viv_uniform uint xrIntFloat_16;
|
||||
_viv_uniform uint yrIntFloat_16;
|
||||
_viv_uniform uint xrIntFloat_16;
|
||||
_viv_uniform uint yrIntFloat_16;
|
||||
|
||||
_viv_uniform VXC_512Bits uniConvertNV12toB_4x4;
|
||||
_viv_uniform VXC_512Bits uniConvertNV12toG_4x4;
|
||||
_viv_uniform VXC_512Bits uniConvertNV12toR_4x4;
|
||||
|
||||
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
|
||||
_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;
|
||||
|
||||
_viv_uniform VXC_512Bits uniExtract8Data_2x8;
|
||||
_viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8;
|
||||
|
||||
__kernel void pre_process_nv12_scale_U8toI16(
|
||||
__read_only image2d_t y_img, __read_only image2d_t uv_img,
|
||||
__write_only image2d_array_t output,
|
||||
global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,
|
||||
float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)
|
||||
{
|
||||
uint4 gidx = get_global_id(0);
|
||||
uint gidy = get_global_id(1);
|
||||
gidx += (uint4)(0, 1, 2, 3);
|
||||
_viv_uniform VXC_512Bits uniCalculateYShift_2x8;
|
||||
_viv_uniform VXC_512Bits uniCalculateUVShift_2x8;
|
||||
|
||||
uint dy = (gidy * yrIntFloat_16) >> 16;
|
||||
uint4 dx = (gidx * xrIntFloat_16) >> 16;
|
||||
int sy = convert_int(dy) + (*yOffset);
|
||||
int4 sx = convert_int4(dx) + (*xOffset);
|
||||
int4 uvX = sx & 0xfffffffe;
|
||||
int uvY = sy >> 1;
|
||||
|
||||
vxc_uchar16 Y, UV;
|
||||
int2 coord = (int2)(sx.x, sy);
|
||||
int2 coord_uv = (int2)(uvX.x, uvY);
|
||||
|
||||
VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
|
||||
coord.x = sx.y;
|
||||
VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
|
||||
coord.x = sx.z;
|
||||
VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
|
||||
coord.x = sx.w;
|
||||
VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
|
||||
coord_uv.x = uvX.y;
|
||||
VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
|
||||
coord_uv.x = uvX.z;
|
||||
VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
|
||||
coord_uv.x = uvX.w;
|
||||
VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
vxc_char16 tmpUV;
|
||||
short tmpVal = 128;
|
||||
VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8);
|
||||
|
||||
float4 tmpDstB, tmpDstG, tmpDstR;
|
||||
VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4);
|
||||
VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4);
|
||||
VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4);
|
||||
|
||||
int4 result;
|
||||
vxc_short8 dst;
|
||||
int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);
|
||||
result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp);
|
||||
dstPos.z = bOrder;
|
||||
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp);
|
||||
dstPos.z = 1;
|
||||
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp);
|
||||
dstPos.z = rOrder;
|
||||
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
#define NV12_OPT_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \
|
||||
__kernel void pre_process_nv12_scale_##name##_gq \
|
||||
( \
|
||||
__read_only image2d_array_t y_img, \
|
||||
__read_only image2d_array_t uv_img, \
|
||||
__write_only image2d_array_t output, \
|
||||
global int* xRatio, \
|
||||
global int* yRatio, \
|
||||
global int* xOffset, \
|
||||
global int* yOffset, \
|
||||
float rMean, \
|
||||
float gMean, \
|
||||
float bMean, \
|
||||
float var, \
|
||||
int reverse_channel, \
|
||||
int trans \
|
||||
) \
|
||||
{ \
|
||||
uint4 gidx = get_global_id(0); \
|
||||
uint gidy = get_global_id(1); \
|
||||
gidx += (uint4)(0, 1, 2, 3); \
|
||||
\
|
||||
uint dy = (gidy * yrIntFloat_16) >> 16; \
|
||||
uint4 dx = (gidx * xrIntFloat_16) >> 16; \
|
||||
int sy = convert_int(dy) + (*yOffset); \
|
||||
int4 sx = convert_int4(dx) + (*xOffset); \
|
||||
int4 uvX = sx & 0xfffffffe; \
|
||||
int uvY = sy >> 1; \
|
||||
\
|
||||
vxc_uchar16 Y, UV; \
|
||||
int2 coord = (int2)(sx.x, sy); \
|
||||
int2 coord_uv = (int2)(uvX.x, uvY); \
|
||||
VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; \
|
||||
vxc_uchar16 maskShiftUv = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; \
|
||||
int4 offsetUV = uvX - uvX.x; \
|
||||
\
|
||||
vxc_ushort8 diffY, diffUV; \
|
||||
_viv_asm(COPY, diffY, sx, 16); \
|
||||
_viv_asm(COPY, diffUV, offsetUV, 16); \
|
||||
\
|
||||
vxc_ushort8 constData = 8; \
|
||||
VXC_DP2x8(maskShift, diffY, constData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
|
||||
uniCalculateYShift_2x8); \
|
||||
VXC_DP2x8(maskShiftUv, diffUV, constData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
|
||||
uniCalculateUVShift_2x8); \
|
||||
VXC_BitExtract(Y, Y, Y, maskShift, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_BitExtract(UV, UV, UV, maskShiftUv, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
vxc_char16 tmpUV; \
|
||||
short tmpVal = 128; \
|
||||
VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); \
|
||||
\
|
||||
float4 tmpDstB, tmpDstG, tmpDstR; \
|
||||
VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); \
|
||||
VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); \
|
||||
VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); \
|
||||
\
|
||||
conv_type result; \
|
||||
dst_type dst0; \
|
||||
save_type dst; \
|
||||
int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \
|
||||
tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \
|
||||
_viv_asm(CONV_RTE, result, tmpDstB); \
|
||||
dstPos.z = bOrder; \
|
||||
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
|
||||
_viv_asm(COPY, dst, dst0, copy_bytes); \
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \
|
||||
_viv_asm(CONV_RTE, result, tmpDstG); \
|
||||
dstPos.z = 1; \
|
||||
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
|
||||
_viv_asm(COPY, dst, dst0, copy_bytes); \
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \
|
||||
_viv_asm(CONV_RTE, result, tmpDstR); \
|
||||
dstPos.z = rOrder; \
|
||||
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
|
||||
_viv_asm(COPY, dst, dst0, copy_bytes); \
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
}
|
||||
NV12_OPT_SH_IMPL(U8toU8, vxc_uchar8, int4, vxc_uchar8, 8)
|
||||
NV12_OPT_SH_IMPL(U8toI8, vxc_char8, int4, vxc_char8, 8)
|
||||
NV12_OPT_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16)
|
||||
NV12_OPT_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16)
|
||||
|
||||
__kernel void pre_process_nv12_scale_U8toF16(
|
||||
__read_only image2d_t y_img, __read_only image2d_t uv_img,
|
||||
__write_only image2d_array_t output,
|
||||
global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,
|
||||
float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)
|
||||
{
|
||||
uint4 gidx = get_global_id(0);
|
||||
uint gidy = get_global_id(1);
|
||||
gidx += (uint4)(0, 1, 2, 3);
|
||||
|
||||
uint dy = (gidy * yrIntFloat_16) >> 16;
|
||||
uint4 dx = (gidx * xrIntFloat_16) >> 16;
|
||||
int sy = convert_int(dy) + (*yOffset);
|
||||
int4 sx = convert_int4(dx) + (*xOffset);
|
||||
int4 uvX = sx & 0xfffffffe;
|
||||
int uvY = sy >> 1;
|
||||
|
||||
vxc_uchar16 Y, UV;
|
||||
int2 coord = (int2)(sx.x, sy);
|
||||
int2 coord_uv = (int2)(uvX.x, uvY);
|
||||
|
||||
VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
|
||||
coord.x = sx.y;
|
||||
VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
|
||||
coord.x = sx.z;
|
||||
VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
|
||||
coord.x = sx.w;
|
||||
VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
|
||||
coord_uv.x = uvX.y;
|
||||
VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
|
||||
coord_uv.x = uvX.z;
|
||||
VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
|
||||
coord_uv.x = uvX.w;
|
||||
VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
vxc_char16 tmpUV;
|
||||
short tmpVal = 128;
|
||||
VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8);
|
||||
|
||||
float4 tmpDstB, tmpDstG, tmpDstR;
|
||||
VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4);
|
||||
VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4);
|
||||
VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4);
|
||||
|
||||
tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp;
|
||||
tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp;
|
||||
tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp;
|
||||
|
||||
half4 result;
|
||||
vxc_half8 tmpdst;
|
||||
vxc_short8 dst;
|
||||
int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);
|
||||
_viv_asm(CONV, result, tmpDstB);
|
||||
dstPos.z = bOrder;
|
||||
VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);
|
||||
_viv_asm(COPY, dst, tmpdst, 16);
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
_viv_asm(CONV, result, tmpDstG);
|
||||
dstPos.z = 1;
|
||||
VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);
|
||||
_viv_asm(COPY, dst, tmpdst, 16);
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
_viv_asm(CONV, result, tmpDstR);
|
||||
dstPos.z = rOrder;
|
||||
VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);
|
||||
_viv_asm(COPY, dst, tmpdst, 16);
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
}
|
||||
#define NV12_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \
|
||||
__kernel void pre_process_nv12_scale_##name \
|
||||
( \
|
||||
__read_only image2d_array_t y_img, \
|
||||
__read_only image2d_array_t uv_img, \
|
||||
__write_only image2d_array_t output, \
|
||||
global int* xRatio, \
|
||||
global int* yRatio, \
|
||||
global int* xOffset, \
|
||||
global int* yOffset, \
|
||||
float rMean, \
|
||||
float gMean, \
|
||||
float bMean, \
|
||||
float var, \
|
||||
int reverse_channel, \
|
||||
int trans \
|
||||
) \
|
||||
{ \
|
||||
uint4 gidx = get_global_id(0); \
|
||||
uint gidy = get_global_id(1); \
|
||||
gidx += (uint4)(0, 1, 2, 3); \
|
||||
\
|
||||
uint dy = (gidy * yrIntFloat_16) >> 16; \
|
||||
uint4 dx = (gidx * xrIntFloat_16) >> 16; \
|
||||
int sy = convert_int(dy) + (*yOffset); \
|
||||
int4 sx = convert_int4(dx) + (*xOffset); \
|
||||
int4 uvX = sx & 0xfffffffe; \
|
||||
int uvY = sy >> 1; \
|
||||
\
|
||||
vxc_uchar16 Y, UV; \
|
||||
int2 coord = (int2)(sx.x, sy); \
|
||||
int2 coord_uv = (int2)(uvX.x, uvY); \
|
||||
\
|
||||
VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord.x = sx.y; \
|
||||
VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord.x = sx.z; \
|
||||
VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord.x = sx.w; \
|
||||
VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_uv.x = uvX.y; \
|
||||
VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_uv.x = uvX.z; \
|
||||
VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_uv.x = uvX.w; \
|
||||
VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
vxc_char16 tmpUV; \
|
||||
short tmpVal = 128; \
|
||||
VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); \
|
||||
\
|
||||
float4 tmpDstB, tmpDstG, tmpDstR; \
|
||||
VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); \
|
||||
VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); \
|
||||
VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); \
|
||||
\
|
||||
conv_type result; \
|
||||
dst_type dst0; \
|
||||
save_type dst; \
|
||||
int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \
|
||||
tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \
|
||||
_viv_asm(CONV_RTE, result, tmpDstB); \
|
||||
dstPos.z = bOrder; \
|
||||
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
|
||||
_viv_asm(COPY, dst, dst0, copy_bytes); \
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \
|
||||
_viv_asm(CONV_RTE, result, tmpDstG); \
|
||||
dstPos.z = 1; \
|
||||
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
|
||||
_viv_asm(COPY, dst, dst0, copy_bytes); \
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \
|
||||
_viv_asm(CONV_RTE, result, tmpDstR); \
|
||||
dstPos.z = rOrder; \
|
||||
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
|
||||
_viv_asm(COPY, dst, dst0, copy_bytes); \
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
}
|
||||
NV12_SH_IMPL(U8toU8, vxc_uchar8, int4, vxc_uchar8, 8)
|
||||
NV12_SH_IMPL(U8toI8, vxc_char8, int4, vxc_char8, 8)
|
||||
NV12_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16)
|
||||
NV12_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16)
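Note (not part of the diff): the _scale variants resample with nearest neighbor in Q16.16 fixed point, so xrIntFloat_16 / yrIntFloat_16 hold src/dst ratios pre-scaled by 65536 and the source index is the product shifted right by 16, with the UV plane addressed at half resolution. A small sketch of that index math; the host-side rounding of the ratios is an assumption:

/* Hedged sketch of the Q16.16 nearest-neighbor index computation used above. */
#include <stdint.h>

static inline uint32_t make_ratio_q16(uint32_t src_size, uint32_t dst_size)
{
    /* host-side setup (assumed): ratio = src/dst in Q16.16 */
    return (uint32_t)(((uint64_t)src_size << 16) / dst_size);
}

static inline int src_index(uint32_t dst_index, uint32_t ratio_q16, int offset)
{
    /* mirrors: dx = (gidx * xrIntFloat_16) >> 16; sx = dx + (*xOffset); */
    return (int)((dst_index * ratio_q16) >> 16) + offset;
}

static inline int uv_x(int sx) { return sx & 0xfffffffe; }  /* even column for interleaved UV */
static inline int uv_y(int sy) { return sy >> 1; }          /* UV plane is half height */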
@ -1,197 +0,0 @@
|
|||
#include "cl_viv_vx_ext.h"
|
||||
|
||||
_viv_uniform int bOrder;
|
||||
_viv_uniform int rOrder;
|
||||
|
||||
_viv_uniform float outputScaleVar;
|
||||
_viv_uniform float bMeanScaleVarZp;
|
||||
_viv_uniform float gMeanScaleVarZp;
|
||||
_viv_uniform float rMeanScaleVarZp;
|
||||
|
||||
_viv_uniform uint xrIntFloat_16;
|
||||
_viv_uniform uint yrIntFloat_16;
|
||||
|
||||
_viv_uniform VXC_512Bits uniConvertNV12toB_4x4;
|
||||
_viv_uniform VXC_512Bits uniConvertNV12toG_4x4;
|
||||
_viv_uniform VXC_512Bits uniConvertNV12toR_4x4;
|
||||
|
||||
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
|
||||
_viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8;
|
||||
_viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8;
|
||||
|
||||
__kernel void pre_process_nv12_scale_U8toU8(
|
||||
__read_only image2d_t y_img, __read_only image2d_t uv_img,
|
||||
__write_only image2d_array_t output,
|
||||
global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,
|
||||
float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)
|
||||
{
|
||||
uint4 gidx = get_global_id(0);
|
||||
uint gidy = get_global_id(1);
|
||||
gidx += (uint4)(0, 1, 2, 3);
|
||||
|
||||
uint dy = (gidy * yrIntFloat_16) >> 16;
|
||||
uint4 dx = (gidx * xrIntFloat_16) >> 16;
|
||||
int sy = convert_int(dy) + (*yOffset);
|
||||
int4 sx = convert_int4(dx) + (*xOffset);
|
||||
int4 uvX = sx & 0xfffffffe;
|
||||
int uvY = sy >> 1;
|
||||
|
||||
vxc_uchar16 Y, UV;
|
||||
int2 coord = (int2)(sx.x, sy);
|
||||
int2 coord_uv = (int2)(uvX.x, uvY);
|
||||
|
||||
VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
|
||||
coord.x = sx.y;
|
||||
VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
|
||||
coord.x = sx.z;
|
||||
VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
|
||||
coord.x = sx.w;
|
||||
VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
|
||||
coord_uv.x = uvX.y;
|
||||
VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
|
||||
coord_uv.x = uvX.z;
|
||||
VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
|
||||
coord_uv.x = uvX.w;
|
||||
VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
vxc_char16 tmpUV;
|
||||
short tmpVal = 128;
|
||||
VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8);
|
||||
|
||||
float4 tmpDstB, tmpDstG, tmpDstR;
|
||||
VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4);
|
||||
VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4);
|
||||
VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4);
|
||||
|
||||
int4 result;
|
||||
vxc_uchar8 dst;
|
||||
int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);
|
||||
result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp);
|
||||
dstPos.z = bOrder;
|
||||
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp);
|
||||
dstPos.z = 1;
|
||||
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp);
|
||||
dstPos.z = rOrder;
|
||||
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
}
|
||||
|
||||
__kernel void pre_process_nv12_copy_U8toU8(
|
||||
__read_only image2d_t y_img, __read_only image2d_t uv_img,
|
||||
__write_only image2d_array_t output,
|
||||
global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,
|
||||
float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)
|
||||
{
|
||||
int gidx = get_global_id(0);
|
||||
int gidy = get_global_id(1);
|
||||
|
||||
int sy = gidy + (*yOffset);
|
||||
int sx = gidx + (*xOffset);
|
||||
int uvX = sx & 0xfffffffe;
|
||||
int uvY = sy >> 1;
|
||||
|
||||
vxc_uchar16 Y, UV;
|
||||
|
||||
VXC_ReadImage(Y, y_img, (int2)(sx,sy), VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(UV, uv_img,(int2)(uvX,uvY), VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
vxc_char16 tmpUV;
|
||||
short tmpVal = 128;
|
||||
VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8);
|
||||
|
||||
float4 tmpDstB, tmpDstG, tmpDstR;
|
||||
VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4);
|
||||
VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4);
|
||||
VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4);
|
||||
|
||||
int4 result;
|
||||
vxc_uchar8 dst;
|
||||
int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);
|
||||
result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp);
|
||||
dstPos.z = bOrder;
|
||||
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp);
|
||||
dstPos.z = 1;
|
||||
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp);
|
||||
dstPos.z = rOrder;
|
||||
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
}
|
||||
|
||||
__kernel void pre_process_nv12_scale_U8toI8(
|
||||
__read_only image2d_t y_img, __read_only image2d_t uv_img,
|
||||
__write_only image2d_array_t output,
|
||||
global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,
|
||||
float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)
|
||||
{
|
||||
uint4 gidx = get_global_id(0);
|
||||
uint gidy = get_global_id(1);
|
||||
gidx += (uint4)(0, 1, 2, 3);
|
||||
|
||||
uint dy = (gidy * yrIntFloat_16) >> 16;
|
||||
uint4 dx = (gidx * xrIntFloat_16) >> 16;
|
||||
int sy = convert_int(dy) + (*yOffset);
|
||||
int4 sx = convert_int4(dx) + (*xOffset);
|
||||
int4 uvX = sx & 0xfffffffe;
|
||||
int uvY = sy >> 1;
|
||||
|
||||
vxc_uchar16 Y, UV;
|
||||
int2 coord = (int2)(sx.x, sy);
|
||||
int2 coord_uv = (int2)(uvX.x, uvY);
|
||||
|
||||
VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
|
||||
coord.x = sx.y;
|
||||
VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
|
||||
coord.x = sx.z;
|
||||
VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
|
||||
coord.x = sx.w;
|
||||
VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
|
||||
coord_uv.x = uvX.y;
|
||||
VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
|
||||
coord_uv.x = uvX.z;
|
||||
VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
|
||||
coord_uv.x = uvX.w;
|
||||
VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
vxc_char16 tmpUV;
|
||||
short tmpVal = 128;
|
||||
VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8);
|
||||
|
||||
float4 tmpDstB, tmpDstG, tmpDstR;
|
||||
VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4);
|
||||
VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4);
|
||||
VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4);
|
||||
|
||||
int4 result;
|
||||
vxc_char8 dst;
|
||||
int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);
|
||||
result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp);
|
||||
dstPos.z = bOrder;
|
||||
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp);
|
||||
dstPos.z = 1;
|
||||
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp);
|
||||
dstPos.z = rOrder;
|
||||
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
}
@ -1,162 +0,0 @@
|
|||
#include "cl_viv_vx_ext.h"
|
||||
|
||||
_viv_uniform int bOrder;
|
||||
_viv_uniform int rOrder;
|
||||
|
||||
_viv_uniform float outputScaleVar;
|
||||
_viv_uniform float bMeanScaleVarZp;
|
||||
_viv_uniform float gMeanScaleVarZp;
|
||||
_viv_uniform float rMeanScaleVarZp;
|
||||
|
||||
_viv_uniform uint xrIntFloat_16;
|
||||
_viv_uniform uint yrIntFloat_16;
|
||||
|
||||
_viv_uniform VXC_512Bits uniConvertNV12toB_4x4;
|
||||
_viv_uniform VXC_512Bits uniConvertNV12toG_4x4;
|
||||
_viv_uniform VXC_512Bits uniConvertNV12toR_4x4;
|
||||
|
||||
_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;
|
||||
|
||||
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
|
||||
_viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8;
|
||||
|
||||
_viv_uniform VXC_512Bits uniCalculateYShift_2x8;
|
||||
_viv_uniform VXC_512Bits uniCalculateUVShift_2x8;
|
||||
|
||||
__kernel void pre_process_nv12_scale_U8toU8_gq(
|
||||
__read_only image2d_t y_img, __read_only image2d_t uv_img,
|
||||
__write_only image2d_array_t output,
|
||||
global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,
|
||||
float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)
|
||||
{
|
||||
uint4 gidx = get_global_id(0);
|
||||
uint gidy = get_global_id(1);
|
||||
gidx += (uint4)(0, 1, 2, 3);
|
||||
|
||||
uint dy = (gidy * yrIntFloat_16) >> 16;
|
||||
uint4 dx = (gidx * xrIntFloat_16) >> 16;
|
||||
int sy = convert_int(dy) + (*yOffset);
|
||||
int4 sx = convert_int4(dx) + (*xOffset);
|
||||
int4 uvX = sx & 0xfffffffe;
|
||||
int uvY = sy >> 1;
|
||||
|
||||
vxc_uchar16 Y, UV;
|
||||
int2 coord = (int2)(sx.x, sy);
|
||||
int2 coord_uv = (int2)(uvX.x, uvY);
|
||||
VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
|
||||
vxc_uchar16 maskShiftUv = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
|
||||
int4 offsetUV = uvX - uvX.x;
|
||||
|
||||
vxc_ushort8 diffY, diffUV;
|
||||
_viv_asm(COPY, diffY, sx, 16);
|
||||
_viv_asm(COPY, diffUV, offsetUV, 16);
|
||||
|
||||
vxc_ushort8 constData = 8;
|
||||
VXC_DP2x8(maskShift, diffY, constData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniCalculateYShift_2x8);
|
||||
VXC_DP2x8(maskShiftUv, diffUV, constData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniCalculateUVShift_2x8);
|
||||
VXC_BitExtract(Y, Y, Y, maskShift, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_BitExtract(UV, UV, UV, maskShiftUv, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
vxc_char16 tmpUV;
|
||||
short tmpVal = 128;
|
||||
VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8);
|
||||
|
||||
float4 tmpDstB, tmpDstG, tmpDstR;
|
||||
VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4);
|
||||
VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4);
|
||||
VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4);
|
||||
|
||||
int4 result;
|
||||
vxc_uchar8 dst;
|
||||
int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);
|
||||
result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp);
|
||||
dstPos.z = bOrder;
|
||||
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp);
|
||||
dstPos.z = 1;
|
||||
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp);
|
||||
dstPos.z = rOrder;
|
||||
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
}
|
||||
|
||||
__kernel void pre_process_nv12_scale_U8toF16_gq(
|
||||
__read_only image2d_t y_img, __read_only image2d_t uv_img,
|
||||
__write_only image2d_array_t output,
|
||||
global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,
|
||||
float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)
|
||||
{
|
||||
uint4 gidx = get_global_id(0);
|
||||
uint gidy = get_global_id(1);
|
||||
gidx += (uint4)(0, 1, 2, 3);
|
||||
|
||||
uint dy = (gidy * yrIntFloat_16) >> 16;
|
||||
uint4 dx = (gidx * xrIntFloat_16) >> 16;
|
||||
int sy = convert_int(dy) + (*yOffset);
|
||||
int4 sx = convert_int4(dx) + (*xOffset);
|
||||
int4 uvX = sx & 0xfffffffe;
|
||||
int uvY = sy >> 1;
|
||||
|
||||
vxc_uchar16 Y, UV;
|
||||
int2 coord = (int2)(sx.x, sy);
|
||||
int2 coord_uv = (int2)(uvX.x, uvY);
|
||||
VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
|
||||
vxc_uchar16 maskShiftUv = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
|
||||
int4 offsetUV = uvX - uvX.x;
|
||||
|
||||
vxc_ushort8 diffY, diffUV;
|
||||
_viv_asm(COPY, diffY, sx, 16);
|
||||
_viv_asm(COPY, diffUV, offsetUV, 16);
|
||||
|
||||
vxc_ushort8 constData = 8;
|
||||
VXC_DP2x8(maskShift, diffY, constData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniCalculateYShift_2x8);
|
||||
VXC_DP2x8(maskShiftUv, diffUV, constData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniCalculateUVShift_2x8);
|
||||
VXC_BitExtract(Y, Y, Y, maskShift, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_BitExtract(UV, UV, UV, maskShiftUv, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
vxc_char16 tmpUV;
|
||||
short tmpVal = 128;
|
||||
VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8);
|
||||
|
||||
float4 tmpDstB, tmpDstG, tmpDstR;
|
||||
VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4);
|
||||
VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4);
|
||||
VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4);
|
||||
|
||||
tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp;
|
||||
tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp;
|
||||
tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp;
|
||||
|
||||
half4 result;
|
||||
vxc_half8 tmpdst;
|
||||
vxc_short8 dst;
|
||||
int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);
|
||||
_viv_asm(CONV, result, tmpDstB);
|
||||
dstPos.z = bOrder;
|
||||
VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);
|
||||
_viv_asm(COPY, dst, tmpdst, 16);
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
_viv_asm(CONV, result, tmpDstG);
|
||||
dstPos.z = 1;
|
||||
VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);
|
||||
_viv_asm(COPY, dst, tmpdst, 16);
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
_viv_asm(CONV, result, tmpDstR);
|
||||
dstPos.z = rOrder;
|
||||
VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);
|
||||
_viv_asm(COPY, dst, tmpdst, 16);
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
}
@ -0,0 +1,238 @@
|
|||
#include "cl_viv_vx_ext.h"
|
||||
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpR1st_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpR2nd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpR3rd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpR4th_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateR1st_4x4;
|
||||
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpG1st_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpG2nd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpG3rd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpG4th_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;
|
||||
|
||||
_viv_uniform VXC_512Bits uniCalculateG1st_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateG2nd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateG3rd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateG4th_4x4;
|
||||
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpB1st_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpB2nd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpB3rd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpB4th_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateB1st_4x4;
|
||||
|
||||
_viv_uniform VXC_512Bits uniQuantU8toU8LoB_2x8;
|
||||
_viv_uniform VXC_512Bits uniQuantU8toU8HiB_2x8;
|
||||
_viv_uniform VXC_512Bits uniQuantU8toU8LoG_2x8;
|
||||
_viv_uniform VXC_512Bits uniQuantU8toU8HiG_2x8;
|
||||
_viv_uniform VXC_512Bits uniQuantU8toU8LoR_2x8;
|
||||
_viv_uniform VXC_512Bits uniQuantU8toU8HiR_2x8;
|
||||
|
||||
_viv_uniform int bOrder;
|
||||
_viv_uniform int rOrder;
|
||||
_viv_uniform float output_zp;
|
||||
_viv_uniform float output_scale;
|
||||
|
||||
#define YUV420_COPY_SH_IMPL(name, dst_type) \
__kernel void pre_process_yuv420_copy_##name \
( \
__read_only image2d_array_t y_img, \
__read_only image2d_array_t u_img, \
__read_only image2d_array_t v_img, \
__write_only image2d_array_t output, \
global int * xRatio, \
global int * yRatio, \
global int * xOffset, \
global int * yOffset, \
float rMean, \
float gMean, \
float bMean, \
float var, \
int reverse_channel, \
int trans \
) \
{ \
int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); \
int2 pos1 = (int2)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1); \
vxc_uchar16 Y; \
vxc_uchar8 U, V; \
vxc_int4 C0, C1, C2, C3; \
vxc_uchar16 R, G, B; \
dst_type dst0, dst1, dst2; \
\
VXC_ReadImage(Y, y_img, pos.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(U, u_img, pos1.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(V, v_img, pos1.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
/*C = Y - 16;*/ \
/*D = U - 128;*/ \
/*E = V - 128;*/ \
/* calculate R*/ \
/* ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]*/ \
int tmpV = -56992; \
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4); \
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4); \
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4); \
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4); \
\
VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \
VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \
VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \
VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \
\
/* calculate G*/ \
/* ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]*/ \
/* 298Y - 208V*/ \
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4); \
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4); \
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4); \
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4); \
/* 34784 - 100U*/ \
ushort tmpG = 34784; \
vxc_ushort8 tmpDstG; \
VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); \
VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4); \
VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4); \
VXC_DP4x4(G, C2, tmpDstG, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4); \
VXC_DP4x4(G, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4); \
\
/* calculate B*/ \
/* ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]*/ \
VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4); \
VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4); \
VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4); \
VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4); \
tmpV = -70688; \
VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \
VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \
VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \
VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \
\
var *= output_scale; \
float4 paramData = (float4)(bMean * var - output_zp, gMean * var - output_zp, \
rMean * var - output_zp, var); \
half4 paramData_f16; \
_viv_asm(CONV, paramData_f16, paramData); \
\
VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); \
VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); \
\
VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); \
VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); \
\
VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); \
VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); \
\
pos = (int4)(get_global_id(0), get_global_id(1), 0, 0); \
pos.z = bOrder; \
VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
pos.z = 1; \
VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
pos.z = rOrder; \
VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
}
YUV420_COPY_SH_IMPL(U8toU8, vxc_uchar16)
YUV420_COPY_SH_IMPL(U8toI8, vxc_char16)

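For reference, the uniCalculate* dot-product uniforms above carry the same integer BT.601 conversion that the comments spell out. A scalar sketch of that arithmetic in plain C (hypothetical helper names, not part of the kernels; it assumes the uniforms accumulate the same per-lane products) is:

/* Scalar sketch of the fixed-point YUV -> RGB math used by the copy kernels. */
static inline unsigned char clamp_u8(int v)
{
    return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void yuv_to_rgb_pixel(int y, int u, int v,
                             unsigned char *r, unsigned char *g, unsigned char *b)
{
    /* C = Y - 16, D = U - 128, E = V - 128, with the constant terms folded
     * into the biases -56992, 34784 and -70688 used above. */
    *r = clamp_u8((298 * y + 409 * v - 56992) >> 8);
    *g = clamp_u8((298 * y - 100 * u - 208 * v + 34784) >> 8);
    *b = clamp_u8((298 * y + 516 * u - 70688) >> 8);
}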
#define YUV420_COPY_16BITS_SH_IMPL(name, dst_type) \
__kernel void pre_process_yuv420_copy_##name \
( \
__read_only image2d_array_t y_img, \
__read_only image2d_array_t u_img, \
__read_only image2d_array_t v_img, \
__write_only image2d_array_t output, \
global int * xRatio, \
global int * yRatio, \
global int * xOffset, \
global int * yOffset, \
float rMean, \
float gMean, \
float bMean, \
float var, \
int reverse_channel, \
int trans \
) \
{ \
int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); \
int2 pos1 = (int2)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1); \
vxc_uchar16 Y; \
vxc_uchar8 U, V; \
vxc_int4 C0, C1, C2, C3; \
vxc_uchar16 R, G, B; \
dst_type dst0, dst1, dst2, dst3, dst4, dst5; \
vxc_short8 out0, out1, out2, out3, out4, out5; \
\
VXC_ReadImage(Y, y_img, pos.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(U, u_img, pos1.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(V, v_img, pos1.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
int tmpV = -56992; \
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4); \
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4); \
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4); \
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4); \
\
VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \
VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \
VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \
VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \
\
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4); \
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4); \
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4); \
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4); \
\
ushort tmpG = 34784; \
vxc_ushort8 tmpDstG; \
VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); \
VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4); \
VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4); \
VXC_DP4x4(G, C2, tmpDstG, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4); \
VXC_DP4x4(G, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4); \
\
VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4); \
VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4); \
VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4); \
VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4); \
tmpV = -70688; \
VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \
VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \
VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \
VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \
\
var *= output_scale; \
float4 paramData = (float4)(bMean * var - output_zp, gMean * var - output_zp, \
rMean * var - output_zp, var); \
half4 paramData_f16; \
_viv_asm(CONV, paramData_f16, paramData); \
\
VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); \
VXC_DP2x8(dst1, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); \
\
VXC_DP2x8(dst2, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); \
VXC_DP2x8(dst3, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); \
\
VXC_DP2x8(dst4, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); \
VXC_DP2x8(dst5, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); \
\
_viv_asm(COPY, out0, dst0, 16); \
_viv_asm(COPY, out1, dst1, 16); \
_viv_asm(COPY, out2, dst2, 16); \
_viv_asm(COPY, out3, dst3, 16); \
_viv_asm(COPY, out4, dst4, 16); \
_viv_asm(COPY, out5, dst5, 16); \
\
pos = (int4)(get_global_id(0), get_global_id(1), bOrder, get_global_id(0) + 8); \
VXC_WriteImage2DArray(output, pos.xyzz, out0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_WriteImage2DArray(output, pos.wyzz, out1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
pos.z = 1; \
VXC_WriteImage2DArray(output, pos.xyzz, out2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_WriteImage2DArray(output, pos.wyzz, out3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
pos.z = rOrder; \
VXC_WriteImage2DArray(output, pos.xyzz, out4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_WriteImage2DArray(output, pos.wyzz, out5, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
YUV420_COPY_16BITS_SH_IMPL(U8toF16, vxc_half8)
YUV420_COPY_16BITS_SH_IMPL(U8toI16, vxc_short8)

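The copy kernels fold mean/variance normalization and output quantization into a single multiply-add: var is pre-multiplied by output_scale and paramData packs the per-channel offsets (mean * var - output_zp). A scalar sketch of the fused step in plain C (hypothetical name; it assumes the uniQuantU8to* uniforms compute pixel * scale - offset per lane, with round-to-nearest-even rather than the simple rounding shown) is:

/* Scalar sketch of the fused normalize + quantize step used above. */
static int quantize_channel(unsigned char pixel, float mean, float var,
                            float output_scale, float output_zp)
{
    float scale  = var * output_scale;           /* var *= output_scale            */
    float offset = mean * scale - output_zp;     /* packed into paramData          */
    float q      = pixel * scale - offset;       /* == (pixel - mean)*scale + zp   */
    return (int)(q + (q >= 0.0f ? 0.5f : -0.5f));
}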
@@ -1,240 +0,0 @@
#include "cl_viv_vx_ext.h"
|
||||
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpR1st_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpR2nd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpR3rd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpR4th_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateR1st_4x4;
|
||||
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpG1st_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpG2nd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpG3rd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpG4th_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;
|
||||
|
||||
_viv_uniform VXC_512Bits uniCalculateG1st_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateG2nd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateG3rd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateG4th_4x4;
|
||||
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpB1st_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpB2nd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpB3rd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpB4th_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateB1st_4x4;
|
||||
|
||||
_viv_uniform VXC_512Bits uniQuantU8toU8LoB_2x8;
|
||||
_viv_uniform VXC_512Bits uniQuantU8toU8HiB_2x8;
|
||||
_viv_uniform VXC_512Bits uniQuantU8toU8LoG_2x8;
|
||||
_viv_uniform VXC_512Bits uniQuantU8toU8HiG_2x8;
|
||||
_viv_uniform VXC_512Bits uniQuantU8toU8LoR_2x8;
|
||||
_viv_uniform VXC_512Bits uniQuantU8toU8HiR_2x8;
|
||||
|
||||
_viv_uniform int bOrder;
|
||||
_viv_uniform int rOrder;
|
||||
_viv_uniform int zp;
|
||||
_viv_uniform float outputScale;
|
||||
|
||||
__kernel void pre_process_yuv420_copy_U8toU8(
|
||||
__read_only image2d_t y_img,
|
||||
__read_only image2d_t u_img,
|
||||
__read_only image2d_t v_img,
|
||||
__write_only image2d_array_t output,
|
||||
global int * xRatio,
|
||||
global int * yRatio,
|
||||
global int * xOffset,
|
||||
global int * yOffset,
|
||||
float rMean,
|
||||
float gMean,
|
||||
float bMean,
|
||||
float var,
|
||||
int reverse_channel,
|
||||
int trans
|
||||
)
|
||||
{
|
||||
int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0);
|
||||
int2 pos1 = (int2)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1);
|
||||
vxc_uchar16 Y;
|
||||
vxc_uchar8 U, V;
|
||||
vxc_int4 C0, C1, C2, C3;
|
||||
vxc_uchar16 R, G, B;
|
||||
vxc_uchar16 dst0, dst1, dst2;
|
||||
|
||||
VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(U, u_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
//C = Y - 16;
|
||||
//D = U - 128;
|
||||
//E = V - 128;
|
||||
// calculate R
|
||||
// ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]
|
||||
int tmpV = -56992;
|
||||
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4);
|
||||
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4);
|
||||
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4);
|
||||
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4);
|
||||
|
||||
VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
|
||||
VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
|
||||
VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
|
||||
VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
|
||||
|
||||
// calculate G
|
||||
// ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]
|
||||
// 298Y - 208V
|
||||
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4);
|
||||
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4);
|
||||
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4);
|
||||
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4);
|
||||
// 34784 - 100U
|
||||
ushort tmpG = 34784;
|
||||
vxc_ushort8 tmpDstG;
|
||||
VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);
|
||||
VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4);
|
||||
VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4);
|
||||
VXC_DP4x4(G, C2, tmpDstG, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4);
|
||||
VXC_DP4x4(G, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4);
|
||||
|
||||
// calculate B
|
||||
// ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]
|
||||
VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4);
|
||||
VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4);
|
||||
VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4);
|
||||
VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4);
|
||||
tmpV = -70688;
|
||||
VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
|
||||
VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
|
||||
VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
|
||||
VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
|
||||
|
||||
var *= outputScale;
|
||||
float4 paramData = (float4)(bMean * var - zp, gMean * var - zp,\
|
||||
rMean * var - zp, var);
|
||||
half4 paramData_f16;
|
||||
_viv_asm(CONV, paramData_f16, paramData);
|
||||
|
||||
VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8);
|
||||
VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8);
|
||||
|
||||
VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8);
|
||||
VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8);
|
||||
|
||||
VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8);
|
||||
VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8);
|
||||
|
||||
pos = (int4)(get_global_id(0), get_global_id(1), 0, 0);
|
||||
pos.z = bOrder;
|
||||
VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
|
||||
pos.z = 1;
|
||||
VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
|
||||
pos.z = rOrder;
|
||||
VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
|
||||
}
|
||||
|
||||
__kernel void pre_process_yuv420_copy_U8toF16(
|
||||
__read_only image2d_t y_img,
|
||||
__read_only image2d_t u_img,
|
||||
__read_only image2d_t v_img,
|
||||
__write_only image2d_array_t output,
|
||||
global int * xRatio,
|
||||
global int * yRatio,
|
||||
global int * xOffset,
|
||||
global int * yOffset,
|
||||
float rMean,
|
||||
float gMean,
|
||||
float bMean,
|
||||
float var,
|
||||
int reverse_channel,
|
||||
int trans
|
||||
)
|
||||
{
|
||||
int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0);
|
||||
int2 pos1 = (int2)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1);
|
||||
vxc_uchar16 Y;
|
||||
vxc_uchar8 U, V;
|
||||
vxc_int4 C0, C1, C2, C3;
|
||||
vxc_uchar16 R, G, B;
|
||||
vxc_half8 dst0, dst1, dst2, dst3, dst4, dst5;
|
||||
vxc_short8 out0, out1, out2, out3, out4, out5;
|
||||
|
||||
VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(U, u_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
//C = Y - 16;
|
||||
//D = U - 128;
|
||||
//E = V - 128;
|
||||
// calculate R
|
||||
// ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]
|
||||
int tmpV = -56992;
|
||||
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4);
|
||||
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4);
|
||||
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4);
|
||||
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4);
|
||||
|
||||
VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
|
||||
VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
|
||||
VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
|
||||
VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
|
||||
|
||||
// calculate G
|
||||
// ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]
|
||||
// 298Y - 208V
|
||||
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4);
|
||||
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4);
|
||||
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4);
|
||||
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4);
|
||||
// 34784 - 100U
|
||||
ushort tmpG = 34784;
|
||||
vxc_ushort8 tmpDstG;
|
||||
VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);
|
||||
VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4);
|
||||
VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4);
|
||||
VXC_DP4x4(G, C2, tmpDstG, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4);
|
||||
VXC_DP4x4(G, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4);
|
||||
|
||||
// calculate B
|
||||
// ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]
|
||||
VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4);
|
||||
VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4);
|
||||
VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4);
|
||||
VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4);
|
||||
tmpV = -70688;
|
||||
VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
|
||||
VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
|
||||
VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
|
||||
VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
|
||||
|
||||
float4 paramData = (float4)(bMean * var, gMean * var,\
|
||||
rMean * var, var);
|
||||
half4 paramData_f16;
|
||||
_viv_asm(CONV, paramData_f16, paramData);
|
||||
|
||||
VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8);
|
||||
VXC_DP2x8(dst1, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8);
|
||||
|
||||
VXC_DP2x8(dst2, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8);
|
||||
VXC_DP2x8(dst3, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8);
|
||||
|
||||
VXC_DP2x8(dst4, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8);
|
||||
VXC_DP2x8(dst5, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8);
|
||||
|
||||
_viv_asm(COPY, out0, dst0, 16);
|
||||
_viv_asm(COPY, out1, dst1, 16);
|
||||
_viv_asm(COPY, out2, dst2, 16);
|
||||
_viv_asm(COPY, out3, dst3, 16);
|
||||
_viv_asm(COPY, out4, dst4, 16);
|
||||
_viv_asm(COPY, out5, dst5, 16);
|
||||
|
||||
pos = (int4)(get_global_id(0), get_global_id(1), bOrder, get_global_id(0) + 8);
|
||||
VXC_WriteImage2DArray(output, pos.xyzz, out0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_WriteImage2DArray(output, pos.wyzz, out1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
pos.z = 1;
|
||||
VXC_WriteImage2DArray(output, pos.xyzz, out2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_WriteImage2DArray(output, pos.wyzz, out3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
pos.z = rOrder;
|
||||
VXC_WriteImage2DArray(output, pos.xyzz, out4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_WriteImage2DArray(output, pos.wyzz, out5, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
}
|
||||
|
||||
|
|
@@ -0,0 +1,237 @@
#include "cl_viv_vx_ext.h"

_viv_uniform VXC_512Bits uniCalculateR1st_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;
_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;

_viv_uniform VXC_512Bits uniCalculateB1st_4x4;
_viv_uniform VXC_512Bits uniExtract8Data_2x8;
_viv_uniform VXC_512Bits uniDescaleU8_4x4;

_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;
_viv_uniform VXC_512Bits uniCalculateGWise_4x4;
_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;

_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;

_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4;

_viv_uniform int bOrder;
_viv_uniform int rOrder;
_viv_uniform float output_zp;
_viv_uniform float output_scale;

#define YUV420_SCALE_8BITS_SH_IMPL(name, dst_type) \
|
||||
__kernel void pre_process_yuv420_scale_##name \
|
||||
( \
|
||||
__read_only image2d_array_t y_img, \
|
||||
__read_only image2d_array_t u_img, \
|
||||
__read_only image2d_array_t v_img, \
|
||||
__write_only image2d_array_t output, \
|
||||
global int * xRatio, \
|
||||
global int * yRatio, \
|
||||
global int * xOffset, \
|
||||
global int * yOffset, \
|
||||
float rMean, \
|
||||
float gMean, \
|
||||
float bMean, \
|
||||
float var, \
|
||||
int reverse_channel, \
|
||||
int trans \
|
||||
) \
|
||||
{ \
|
||||
int4 gidx = get_global_id(0); \
|
||||
int gidy = get_global_id(1); \
|
||||
gidx += (int4)(0, 1, 2, 3); \
|
||||
\
|
||||
int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); \
|
||||
int4 sx = fx & 0xffff8000; \
|
||||
int fy, sy; \
|
||||
fx -= sx; \
|
||||
sx = sx >> 15; \
|
||||
fx = (fx +(1 << 4)) >> 5; \
|
||||
\
|
||||
fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); \
|
||||
sy = fy & 0xffff8000; \
|
||||
fy -= sy; \
|
||||
sy = sy >> 15; \
|
||||
\
|
||||
sy = sy < 0 ? 0 : sy; \
|
||||
fy = fy < 0 ? 0 : fy; \
|
||||
\
|
||||
fy = (fy + (1<< 4)) >> 5; \
|
||||
sx += (*xOffset); \
|
||||
sy += (*yOffset); \
|
||||
int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0); \
|
||||
int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0); \
|
||||
int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0); \
|
||||
\
|
||||
vxc_uchar16 Y, U, V; \
|
||||
vxc_int4 C0, C1, C2, C3; \
|
||||
vxc_uchar16 R, G, B; \
|
||||
\
|
||||
VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
|
||||
srcPos1.x = (sx.x + 1) >> 1; \
|
||||
VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \
|
||||
srcPos2.x = (sx.x + 1) >> 1; \
|
||||
VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
srcPos.x = sx.y; \
|
||||
srcPos1.x = sx.y >> 1; \
|
||||
srcPos2.x = sx.y >> 1; \
|
||||
VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); \
|
||||
srcPos1.x = (sx.y + 1) >> 1; \
|
||||
VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); \
|
||||
srcPos2.x = (sx.y + 1) >> 1; \
|
||||
VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
srcPos.x = sx.z; \
|
||||
srcPos1.x = sx.z >> 1; \
|
||||
srcPos2.x = sx.z >> 1; \
|
||||
VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); \
|
||||
srcPos1.x = (sx.z + 1) >> 1; \
|
||||
VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); \
|
||||
srcPos2.x = (sx.z + 1) >> 1; \
|
||||
VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
srcPos.x = sx.w; \
|
||||
srcPos1.x = sx.w >> 1; \
|
||||
srcPos2.x = sx.w >> 1; \
|
||||
VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); \
|
||||
srcPos1.x = (sx.w + 1) >> 1; \
|
||||
VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); \
|
||||
srcPos2.x = (sx.w + 1) >> 1; \
|
||||
VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
int tmpV = -56992; \
|
||||
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); \
|
||||
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); \
|
||||
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); \
|
||||
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); \
|
||||
VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \
|
||||
VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \
|
||||
VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \
|
||||
VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \
|
||||
\
|
||||
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); \
|
||||
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); \
|
||||
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); \
|
||||
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); \
|
||||
\
|
||||
ushort tmpG = 34784; \
|
||||
vxc_ushort8 tmpDstG, tmpDstG1; \
|
||||
VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); \
|
||||
VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); \
|
||||
VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \
|
||||
VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \
|
||||
VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \
|
||||
VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \
|
||||
\
|
||||
VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); \
|
||||
VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); \
|
||||
VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); \
|
||||
VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); \
|
||||
tmpV = -70688; \
|
||||
VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \
|
||||
VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \
|
||||
VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \
|
||||
VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \
|
||||
\
|
||||
int4 result, temp1, temp2; \
|
||||
int4 tmpData0, tmpData1; \
|
||||
\
|
||||
VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \
|
||||
VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \
|
||||
temp1 = fx * tmpData0 + tmpData1; \
|
||||
\
|
||||
VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \
|
||||
VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \
|
||||
temp2 = fx * tmpData0 + tmpData1; \
|
||||
result = fy * temp2 + (temp1 << 10); \
|
||||
\
|
||||
tmpV = 1 << 19; \
|
||||
dst_type dst; \
|
||||
float4 tmpDst; \
|
||||
int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \
|
||||
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \
|
||||
tmpDst = (tmpDst - bMean) * var; \
|
||||
dstPos.z = bOrder; \
|
||||
result = convert_int4_rte(tmpDst * output_scale + output_zp); \
|
||||
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \
|
||||
VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \
|
||||
temp1 = fx * tmpData0 + tmpData1; \
|
||||
VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \
|
||||
VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \
|
||||
temp2 = fx * tmpData0 + tmpData1; \
|
||||
result = fy * temp2 + (temp1 << 10); \
|
||||
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \
|
||||
tmpDst = (tmpDst - gMean) * var; \
|
||||
dstPos.z = 1; \
|
||||
result = convert_int4_rte(tmpDst * output_scale + output_zp); \
|
||||
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \
|
||||
VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \
|
||||
temp1 = fx * tmpData0 + tmpData1; \
|
||||
VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \
|
||||
VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \
|
||||
temp2 = fx * tmpData0 + tmpData1; \
|
||||
result = fy * temp2 + (temp1 << 10); \
|
||||
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \
|
||||
tmpDst = (tmpDst - rMean) * var; \
|
||||
dstPos.z = rOrder; \
|
||||
result = convert_int4_rte(tmpDst * output_scale + output_zp); \
|
||||
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
}
|
||||
YUV420_SCALE_8BITS_SH_IMPL(U8toU8, vxc_uchar8)
|
||||
YUV420_SCALE_8BITS_SH_IMPL(U8toI8, vxc_char8)
|
||||
|
|
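The scale kernels above derive integer source coordinates and fractional weights from *xRatio/*yRatio in fixed point before the bilinear blend; the fx/fy weights end up in roughly Q10, so the blended result is descaled by 2^20 with the 1 << 19 rounding constant fed to uniDescaleU8_4x4. A scalar sketch of the coordinate/weight mapping in plain C (hypothetical name; it assumes xRatio/yRatio are Q15 input/output size ratios, as the bit manipulation suggests) is:

/* Scalar sketch of the fixed-point coordinate / weight math used by the
 * yuv420_scale kernels. */
static void map_coord(int dst, int ratio_q15, int offset,
                      int *src, int *weight_q10)
{
    int f = dst * ratio_q15 + (ratio_q15 >> 1) - (1 << 14); /* center-aligned, Q15 */
    int s = f & ~0x7fff;                /* floor to the integer part (still Q15)   */
    f -= s;                             /* fractional part, Q15                    */
    *src = (s >> 15) + offset;          /* integer source coordinate               */
    *weight_q10 = (f + (1 << 4)) >> 5;  /* reduce the fraction to ~Q10             */
}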
@@ -0,0 +1,245 @@
#include "cl_viv_vx_ext.h"

_viv_uniform VXC_512Bits uniCalculateR1st_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;
_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;

_viv_uniform VXC_512Bits uniCalculateB1st_4x4;
_viv_uniform VXC_512Bits uniExtract8Data_2x8;
_viv_uniform VXC_512Bits uniDescaleU8_4x4;

_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;
_viv_uniform VXC_512Bits uniCalculateGWise_4x4;
_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;

_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;

_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4;

_viv_uniform int bOrder;
_viv_uniform int rOrder;
_viv_uniform float output_scale;
_viv_uniform float output_zp;

#define YUV420_SCALE_16BITS_SH_IMPL(name, dst_type, conv_type) \
|
||||
__kernel void pre_process_yuv420_scale_##name \
|
||||
( \
|
||||
__read_only image2d_array_t y_img, \
|
||||
__read_only image2d_array_t u_img, \
|
||||
__read_only image2d_array_t v_img, \
|
||||
__write_only image2d_array_t output, \
|
||||
global int * xRatio, \
|
||||
global int * yRatio, \
|
||||
global int * xOffset, \
|
||||
global int * yOffset, \
|
||||
float rMean, \
|
||||
float gMean, \
|
||||
float bMean, \
|
||||
float var, \
|
||||
int reverse_channel, \
|
||||
int trans \
|
||||
) \
|
||||
{ \
|
||||
int4 gidx = get_global_id(0); \
|
||||
int gidy = get_global_id(1); \
|
||||
gidx += (int4)(0, 1, 2, 3); \
|
||||
\
|
||||
int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); \
|
||||
int4 sx = fx & 0xffff8000; \
|
||||
int fy, sy; \
|
||||
fx -= sx; \
|
||||
sx = sx >> 15; \
|
||||
fx = (fx +(1 << 4)) >> 5; \
|
||||
\
|
||||
fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); \
|
||||
sy = fy & 0xffff8000; \
|
||||
fy -= sy; \
|
||||
sy = sy >> 15; \
|
||||
\
|
||||
sy = sy < 0 ? 0 : sy; \
|
||||
fy = fy < 0 ? 0 : fy; \
|
||||
\
|
||||
fy = (fy + (1<< 4)) >> 5; \
|
||||
sx += (*xOffset); \
|
||||
sy += (*yOffset); \
|
||||
int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0); \
|
||||
int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0); \
|
||||
int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0); \
|
||||
\
|
||||
vxc_uchar16 Y, U, V; \
|
||||
vxc_int4 C0, C1, C2, C3; \
|
||||
vxc_uchar16 R, G, B; \
|
||||
\
|
||||
VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
|
||||
srcPos1.x = (sx.x + 1) >> 1; \
|
||||
VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \
|
||||
srcPos2.x = (sx.x + 1) >> 1; \
|
||||
VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
srcPos.x = sx.y; \
|
||||
srcPos1.x = sx.y >> 1; \
|
||||
srcPos2.x = sx.y >> 1; \
|
||||
VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); \
|
||||
srcPos1.x = (sx.y + 1) >> 1; \
|
||||
VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); \
|
||||
srcPos2.x = (sx.y + 1) >> 1; \
|
||||
VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
srcPos.x = sx.z; \
|
||||
srcPos1.x = sx.z >> 1; \
|
||||
srcPos2.x = sx.z >> 1; \
|
||||
VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); \
|
||||
srcPos1.x = (sx.z + 1) >> 1; \
|
||||
VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); \
|
||||
srcPos2.x = (sx.z + 1) >> 1; \
|
||||
VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
srcPos.x = sx.w; \
|
||||
srcPos1.x = sx.w >> 1; \
|
||||
srcPos2.x = sx.w >> 1; \
|
||||
VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); \
|
||||
srcPos1.x = (sx.w + 1) >> 1; \
|
||||
VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); \
|
||||
srcPos2.x = (sx.w + 1) >> 1; \
|
||||
VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
int tmpV = -56992; \
|
||||
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); \
|
||||
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); \
|
||||
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); \
|
||||
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); \
|
||||
VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \
|
||||
VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \
|
||||
VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \
|
||||
VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \
|
||||
\
|
||||
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); \
|
||||
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); \
|
||||
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); \
|
||||
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); \
|
||||
\
|
||||
ushort tmpG = 34784; \
|
||||
vxc_ushort8 tmpDstG, tmpDstG1; \
|
||||
VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); \
|
||||
VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); \
|
||||
VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \
|
||||
VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \
|
||||
VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \
|
||||
VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \
|
||||
\
|
||||
VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); \
|
||||
VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); \
|
||||
VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); \
|
||||
VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); \
|
||||
tmpV = -70688; \
|
||||
VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \
|
||||
VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \
|
||||
VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \
|
||||
VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \
|
||||
\
|
||||
int4 result, temp1, temp2; \
|
||||
int4 tmpData0, tmpData1; \
|
||||
dst_type tmpResult; \
|
||||
conv_type tmpVal; \
|
||||
\
|
||||
VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \
|
||||
VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \
|
||||
temp1 = fx * tmpData0 + tmpData1; \
|
||||
\
|
||||
VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \
|
||||
VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \
|
||||
temp2 = fx * tmpData0 + tmpData1; \
|
||||
result = fy * temp2 + (temp1 << 10); \
|
||||
\
|
||||
tmpV = 1 << 19; \
|
||||
vxc_short8 dst; \
|
||||
float4 tmpDst; \
|
||||
int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \
|
||||
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \
|
||||
tmpDst = (tmpDst - bMean) * var; \
|
||||
dstPos.z = bOrder; \
|
||||
tmpDst = tmpDst * output_scale + output_zp; \
|
||||
_viv_asm(CONV_RTE, tmpVal, tmpDst); \
|
||||
VXC_DP2x8(tmpResult, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \
|
||||
_viv_asm(COPY, dst, tmpResult, 8); \
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \
|
||||
VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \
|
||||
temp1 = fx * tmpData0 + tmpData1; \
|
||||
VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \
|
||||
VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \
|
||||
temp2 = fx * tmpData0 + tmpData1; \
|
||||
result = fy * temp2 + (temp1 << 10); \
|
||||
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \
|
||||
tmpDst = (tmpDst - gMean) * var; \
|
||||
dstPos.z = 1; \
|
||||
tmpDst = tmpDst * output_scale + output_zp; \
|
||||
_viv_asm(CONV_RTE, tmpVal, tmpDst); \
|
||||
VXC_DP2x8(tmpResult, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \
|
||||
_viv_asm(COPY, dst, tmpResult, 8); \
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \
|
||||
VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \
|
||||
temp1 = fx * tmpData0 + tmpData1; \
|
||||
VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \
|
||||
VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \
|
||||
temp2 = fx * tmpData0 + tmpData1; \
|
||||
result = fy * temp2 + (temp1 << 10); \
|
||||
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \
|
||||
tmpDst = (tmpDst - rMean) * var; \
|
||||
dstPos.z = rOrder; \
|
||||
tmpDst = tmpDst * output_scale + output_zp; \
|
||||
_viv_asm(CONV_RTE, tmpVal, tmpDst); \
|
||||
VXC_DP2x8(tmpResult, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \
|
||||
_viv_asm(COPY, dst, tmpResult, 8); \
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
}
|
||||
YUV420_SCALE_16BITS_SH_IMPL(U8toF16, vxc_half8, half4)
|
||||
YUV420_SCALE_16BITS_SH_IMPL(U8toI16, vxc_short8, int4)
|
||||
|
|
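All of these kernels emit planar output: the B result is written to plane bOrder, G always to plane 1, and R to plane rOrder, so swapping bOrder and rOrder flips between RGB and BGR layouts. A scalar sketch of that store pattern in plain C (hypothetical helper; it assumes bOrder/rOrder are set host-side from reverse_channel, which this diff does not show) is:

/* Scalar sketch of the per-channel plane stores (z = bOrder / 1 / rOrder). */
static void store_planes(unsigned char *out, int width, int plane_stride,
                         int x, int y, int bOrder, int rOrder,
                         unsigned char r, unsigned char g, unsigned char b)
{
    out[bOrder * plane_stride + y * width + x] = b;   /* pos.z = bOrder */
    out[1      * plane_stride + y * width + x] = g;   /* pos.z = 1      */
    out[rOrder * plane_stride + y * width + x] = r;   /* pos.z = rOrder */
}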
@@ -1,232 +0,0 @@
#include "cl_viv_vx_ext.h"
|
||||
|
||||
_viv_uniform VXC_512Bits uniCalculateR1st_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;
|
||||
|
||||
_viv_uniform VXC_512Bits uniCalculateB1st_4x4;
|
||||
_viv_uniform VXC_512Bits uniDescaleU8_4x4;
|
||||
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateGWise_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;
|
||||
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;
|
||||
|
||||
_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;
|
||||
_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;
|
||||
_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;
|
||||
_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4;
|
||||
|
||||
_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;
|
||||
|
||||
_viv_uniform int bOrder;
|
||||
_viv_uniform int rOrder;
|
||||
|
||||
__kernel void pre_process_yuv420_scale_U8toF16(
|
||||
__read_only image2d_array_t y_img, __read_only image2d_array_t u_img,
|
||||
__read_only image2d_array_t v_img, __write_only image2d_array_t output,
|
||||
global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,
|
||||
float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)
|
||||
{
|
||||
int4 gidx = get_global_id(0);
|
||||
int gidy = get_global_id(1);
|
||||
gidx += (int4)(0, 1, 2, 3);
|
||||
|
||||
int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14);
|
||||
int4 sx = fx & 0xffff8000; // Floor
|
||||
int fy, sy;
|
||||
fx -= sx;
|
||||
sx = sx >> 15;
|
||||
fx = (fx +(1 << 4)) >> 5;
|
||||
|
||||
// for y
|
||||
fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14);
|
||||
sy = fy & 0xffff8000; // Floor
|
||||
fy -= sy;
|
||||
sy = sy >> 15;
|
||||
|
||||
sy = sy < 0 ? 0 : sy;
|
||||
fy = fy < 0 ? 0 : fy;
|
||||
|
||||
fy = (fy + (1<< 4)) >> 5;
|
||||
sx += (*xOffset);
|
||||
sy += (*yOffset);
|
||||
int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0);
|
||||
int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0);
|
||||
int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0);
|
||||
|
||||
vxc_uchar16 Y, U, V;
|
||||
vxc_int4 C0, C1, C2, C3;
|
||||
vxc_uchar16 R, G, B;
|
||||
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
|
||||
srcPos1.x = (sx.x + 1) >> 1;
|
||||
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));

VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
srcPos2.x = (sx.x + 1) >> 1;
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));

srcPos.x = sx.y;
srcPos1.x = sx.y >> 1;
srcPos2.x = sx.y >> 1;
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
srcPos1.x = (sx.y + 1) >> 1;
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));

VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
srcPos2.x = (sx.y + 1) >> 1;
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));

srcPos.x = sx.z;
srcPos1.x = sx.z >> 1;
srcPos2.x = sx.z >> 1;
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));
srcPos1.x = (sx.z + 1) >> 1;
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));

VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));
srcPos2.x = (sx.z + 1) >> 1;
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));

srcPos.x = sx.w;
srcPos1.x = sx.w >> 1;
srcPos2.x = sx.w >> 1;
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));
srcPos1.x = (sx.w + 1) >> 1;
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));

VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));
srcPos2.x = (sx.w + 1) >> 1;
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));

//C = Y - 16; D = U - 128; E = V - 128;
// ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]
int tmpV = -56992;
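// 56992 = 298 * 16 + 409 * 128 - 128, i.e. the constant terms of 298 * (Y - 16) + 409 * (V - 128) + 128 folded into one offset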
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4);
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4);
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4);
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4);
VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);

// ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]
// 298Y - 208V
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4);
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4);
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4);
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4);
// 34784 - 100U
ushort tmpG = 34784;
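// 34784 = 100 * 128 + 208 * 128 + 128 - 298 * 16, the folded constant offset for the G channel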
vxc_ushort8 tmpDstG, tmpDstG1;
VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);
VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8);
VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);
VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);
VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);
VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);

// ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]
VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4);
VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4);
VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4);
VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4);
tmpV = -70688;
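// 70688 = 516 * 128 + 298 * 16 - 128, the folded constant offset for the B channel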
VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);

int4 result, temp1, temp2;
int4 tmpData0, tmpData1;

VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);
VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);
temp1 = fx * tmpData0 + tmpData1;
// temp2 - temp1
VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);
VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);
temp2 = fx * tmpData0 + tmpData1;
result = fy * temp2 + (temp1 << 10);

vxc_half8 tmpVal;
half4 hDst;
tmpV = 1 << 19;
vxc_short8 dst;
float4 tmpDst;
int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);
tmpDst = (tmpDst - bMean) * var;
dstPos.z = bOrder;
_viv_asm(CONV, hDst, tmpDst);
VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);
_viv_asm(COPY, dst, tmpVal, 16);
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));

VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);
VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);
temp1 = fx * tmpData0 + tmpData1;
VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);
VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);
temp2 = fx * tmpData0 + tmpData1;
result = fy * temp2 + (temp1 << 10);
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);
tmpDst = (tmpDst - gMean) * var;
dstPos.z = 1;
_viv_asm(CONV, hDst, tmpDst);
VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);
_viv_asm(COPY, dst, tmpVal, 16);
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));

VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);
VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);
temp1 = fx * tmpData0 + tmpData1;
VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);
VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);
temp2 = fx * tmpData0 + tmpData1;
result = fy * temp2 + (temp1 << 10);
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);
tmpDst = (tmpDst - rMean) * var;
dstPos.z = rOrder;
_viv_asm(CONV, hDst, tmpDst);
VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);
_viv_asm(COPY, dst, tmpVal, 16);
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}

@ -1,227 +0,0 @@
#include "cl_viv_vx_ext.h"
|
||||
|
||||
_viv_uniform VXC_512Bits uniCalculateR1st_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;
|
||||
|
||||
_viv_uniform VXC_512Bits uniCalculateB1st_4x4;
|
||||
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
|
||||
_viv_uniform VXC_512Bits uniDescaleU8_4x4;
|
||||
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateGWise_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;
|
||||
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;
|
||||
|
||||
_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;
|
||||
_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;
|
||||
_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;
|
||||
_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4;
|
||||
|
||||
_viv_uniform int bOrder;
|
||||
_viv_uniform int rOrder;
|
||||
_viv_uniform float outputScale;
|
||||
|
||||
__kernel void pre_process_yuv420_scale_U8toI16(
|
||||
__read_only image2d_array_t y_img, __read_only image2d_array_t u_img,
|
||||
__read_only image2d_array_t v_img, __write_only image2d_array_t output,
|
||||
global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,
|
||||
float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)
|
||||
{
|
||||
int4 gidx = get_global_id(0);
|
||||
int gidy = get_global_id(1);
|
||||
gidx += (int4)(0, 1, 2, 3);
|
||||
|
||||
int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14);
|
||||
int4 sx = fx & 0xffff8000; // Floor
|
||||
int fy, sy;
|
||||
fx -= sx;
|
||||
sx = sx >> 15;
|
||||
fx = (fx +(1 << 4)) >> 5;
|
||||
|
||||
// for y
|
||||
fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14);
|
||||
sy = fy & 0xffff8000; // Floor
|
||||
fy -= sy;
|
||||
sy = sy >> 15;
|
||||
|
||||
sy = sy < 0 ? 0 : sy;
|
||||
fy = fy < 0 ? 0 : fy;
|
||||
|
||||
fy = (fy + (1<< 4)) >> 5;
|
||||
sx += (*xOffset);
|
||||
sy += (*yOffset);
|
||||
int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0);
|
||||
int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0);
|
||||
int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0);
|
||||
|
||||
vxc_uchar16 Y, U, V;
|
||||
vxc_int4 C0, C1, C2, C3;
|
||||
vxc_uchar16 R, G, B;
|
||||
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
|
||||
srcPos1.x = (sx.x + 1) >> 1;
|
||||
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
|
||||
srcPos2.x = (sx.x + 1) >> 1;
|
||||
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
srcPos.x = sx.y;
|
||||
srcPos1.x = sx.y >> 1;
|
||||
srcPos2.x = sx.y >> 1;
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
|
||||
srcPos1.x = (sx.y + 1) >> 1;
|
||||
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
|
||||
srcPos2.x = (sx.y + 1) >> 1;
|
||||
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
srcPos.x = sx.z;
|
||||
srcPos1.x = sx.z >> 1;
|
||||
srcPos2.x = sx.z >> 1;
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));
|
||||
srcPos1.x = (sx.z + 1) >> 1;
|
||||
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));
|
||||
srcPos2.x = (sx.z + 1) >> 1;
|
||||
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
srcPos.x = sx.w;
|
||||
srcPos1.x = sx.w >> 1;
|
||||
srcPos2.x = sx.w >> 1;
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));
|
||||
srcPos1.x = (sx.w + 1) >> 1;
|
||||
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));
|
||||
srcPos2.x = (sx.w + 1) >> 1;
|
||||
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
//C = Y - 16; D = U - 128; E = V - 128;
|
||||
// ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]
|
||||
int tmpV = -56992;
|
||||
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4);
|
||||
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4);
|
||||
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4);
|
||||
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4);
|
||||
VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
|
||||
VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
|
||||
VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
|
||||
VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
|
||||
|
||||
// ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]
|
||||
// 298Y - 208V
|
||||
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4);
|
||||
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4);
|
||||
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4);
|
||||
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4);
|
||||
// 34784 - 100U
|
||||
ushort tmpG = 34784;
|
||||
vxc_ushort8 tmpDstG, tmpDstG1;
|
||||
VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);
|
||||
VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8);
|
||||
VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);
|
||||
VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);
|
||||
VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);
|
||||
VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);
|
||||
|
||||
// ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]
|
||||
VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4);
|
||||
VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4);
|
||||
VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4);
|
||||
VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4);
|
||||
tmpV = -70688;
|
||||
VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
|
||||
VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
|
||||
VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
|
||||
VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
|
||||
|
||||
int4 result, temp1, temp2;
|
||||
int4 tmpData0, tmpData1;
|
||||
|
||||
VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);
|
||||
VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);
|
||||
temp1 = fx * tmpData0 + tmpData1;
|
||||
// temp2 - temp1
|
||||
VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);
|
||||
VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);
|
||||
temp2 = fx * tmpData0 + tmpData1;
|
||||
result = fy * temp2 + (temp1 << 10);
|
||||
|
||||
tmpV = 1 << 19;
|
||||
vxc_short8 dst;
|
||||
float4 tmpDst;
|
||||
int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);
|
||||
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);
|
||||
tmpDst = (tmpDst - bMean) * var;
|
||||
dstPos.z = bOrder;
|
||||
result = convert_int4_rte(tmpDst * outputScale);
|
||||
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);
|
||||
VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);
|
||||
temp1 = fx * tmpData0 + tmpData1;
|
||||
VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);
|
||||
VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);
|
||||
temp2 = fx * tmpData0 + tmpData1;
|
||||
result = fy * temp2 + (temp1 << 10);
|
||||
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);
|
||||
tmpDst = (tmpDst - gMean) * var;
|
||||
dstPos.z = 1;
|
||||
result = convert_int4_rte(tmpDst * outputScale);
|
||||
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);
|
||||
VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);
|
||||
temp1 = fx * tmpData0 + tmpData1;
|
||||
VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);
|
||||
VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);
|
||||
temp2 = fx * tmpData0 + tmpData1;
|
||||
result = fy * temp2 + (temp1 << 10);
|
||||
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);
|
||||
tmpDst = (tmpDst - rMean) * var;
|
||||
dstPos.z = rOrder;
|
||||
result = convert_int4_rte(tmpDst * outputScale);
|
||||
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
}

@ -1,227 +0,0 @@
#include "cl_viv_vx_ext.h"
|
||||
|
||||
_viv_uniform VXC_512Bits uniCalculateR1st_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;
|
||||
|
||||
_viv_uniform VXC_512Bits uniCalculateB1st_4x4;
|
||||
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
|
||||
_viv_uniform VXC_512Bits uniDescaleU8_4x4;
|
||||
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateGWise_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;
|
||||
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;
|
||||
|
||||
_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;
|
||||
_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;
|
||||
_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;
|
||||
_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4;
|
||||
|
||||
_viv_uniform int bOrder;
|
||||
_viv_uniform int rOrder;
|
||||
_viv_uniform float outputScale;
|
||||
|
||||
__kernel void pre_process_yuv420_scale_U8toI8(
|
||||
__read_only image2d_array_t y_img, __read_only image2d_array_t u_img,
|
||||
__read_only image2d_array_t v_img, __write_only image2d_array_t output,
|
||||
global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,
|
||||
float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)
|
||||
{
|
||||
int4 gidx = get_global_id(0);
|
||||
int gidy = get_global_id(1);
|
||||
gidx += (int4)(0, 1, 2, 3);
|
||||
|
||||
int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14);
|
||||
int4 sx = fx & 0xffff8000; // Floor
|
||||
int fy, sy;
|
||||
fx -= sx;
|
||||
sx = sx >> 15;
|
||||
fx = (fx +(1 << 4)) >> 5;
|
||||
|
||||
// for y
|
||||
fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14);
|
||||
sy = fy & 0xffff8000; // Floor
|
||||
fy -= sy;
|
||||
sy = sy >> 15;
|
||||
|
||||
sy = sy < 0 ? 0 : sy;
|
||||
fy = fy < 0 ? 0 : fy;
|
||||
|
||||
fy = (fy + (1<< 4)) >> 5;
|
||||
sx += (*xOffset);
|
||||
sy += (*yOffset);
|
||||
int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0);
|
||||
int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0);
|
||||
int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0);
|
||||
|
||||
vxc_uchar16 Y, U, V;
|
||||
vxc_int4 C0, C1, C2, C3;
|
||||
vxc_uchar16 R, G, B;
|
||||
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
|
||||
srcPos1.x = (sx.x + 1) >> 1;
|
||||
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
|
||||
srcPos2.x = (sx.x + 1) >> 1;
|
||||
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
srcPos.x = sx.y;
|
||||
srcPos1.x = sx.y >> 1;
|
||||
srcPos2.x = sx.y >> 1;
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
|
||||
srcPos1.x = (sx.y + 1) >> 1;
|
||||
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
|
||||
srcPos2.x = (sx.y + 1) >> 1;
|
||||
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
srcPos.x = sx.z;
|
||||
srcPos1.x = sx.z >> 1;
|
||||
srcPos2.x = sx.z >> 1;
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));
|
||||
srcPos1.x = (sx.z + 1) >> 1;
|
||||
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));
|
||||
srcPos2.x = (sx.z + 1) >> 1;
|
||||
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
srcPos.x = sx.w;
|
||||
srcPos1.x = sx.w >> 1;
|
||||
srcPos2.x = sx.w >> 1;
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));
|
||||
srcPos1.x = (sx.w + 1) >> 1;
|
||||
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));
|
||||
srcPos2.x = (sx.w + 1) >> 1;
|
||||
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
//C = Y - 16; D = U - 128; E = V - 128;
|
||||
// ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]
|
||||
int tmpV = -56992;
|
||||
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4);
|
||||
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4);
|
||||
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4);
|
||||
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4);
|
||||
VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
|
||||
VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
|
||||
VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
|
||||
VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
|
||||
|
||||
// ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]
|
||||
// 298Y - 208V
|
||||
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4);
|
||||
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4);
|
||||
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4);
|
||||
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4);
|
||||
// 34784 - 100U
|
||||
ushort tmpG = 34784;
|
||||
vxc_ushort8 tmpDstG, tmpDstG1;
|
||||
VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);
|
||||
VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8);
|
||||
VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);
|
||||
VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);
|
||||
VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);
|
||||
VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);
|
||||
|
||||
// ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]
|
||||
VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4);
|
||||
VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4);
|
||||
VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4);
|
||||
VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4);
|
||||
tmpV = -70688;
|
||||
VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
|
||||
VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
|
||||
VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
|
||||
VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
|
||||
|
||||
int4 result, temp1, temp2;
|
||||
int4 tmpData0, tmpData1;
|
||||
|
||||
VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);
|
||||
VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);
|
||||
temp1 = fx * tmpData0 + tmpData1;
|
||||
// temp2 - temp1
|
||||
VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);
|
||||
VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);
|
||||
temp2 = fx * tmpData0 + tmpData1;
|
||||
result = fy * temp2 + (temp1 << 10);
|
||||
|
||||
tmpV = 1 << 19;
|
||||
vxc_char8 dst;
|
||||
float4 tmpDst;
|
||||
int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);
|
||||
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);
|
||||
tmpDst = (tmpDst - bMean) * var;
|
||||
dstPos.z = bOrder;
|
||||
result = convert_int4_rte(tmpDst * outputScale);
|
||||
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);
|
||||
VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);
|
||||
temp1 = fx * tmpData0 + tmpData1;
|
||||
VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);
|
||||
VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);
|
||||
temp2 = fx * tmpData0 + tmpData1;
|
||||
result = fy * temp2 + (temp1 << 10);
|
||||
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);
|
||||
tmpDst = (tmpDst - gMean) * var;
|
||||
dstPos.z = 1;
|
||||
result = convert_int4_rte(tmpDst * outputScale);
|
||||
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);
|
||||
VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);
|
||||
temp1 = fx * tmpData0 + tmpData1;
|
||||
VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);
|
||||
VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);
|
||||
temp2 = fx * tmpData0 + tmpData1;
|
||||
result = fy * temp2 + (temp1 << 10);
|
||||
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);
|
||||
tmpDst = (tmpDst - rMean) * var;
|
||||
dstPos.z = rOrder;
|
||||
result = convert_int4_rte(tmpDst * outputScale);
|
||||
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
}

@ -1,228 +0,0 @@
#include "cl_viv_vx_ext.h"
|
||||
|
||||
_viv_uniform VXC_512Bits uniCalculateR1st_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;
|
||||
|
||||
_viv_uniform VXC_512Bits uniCalculateB1st_4x4;
|
||||
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
|
||||
_viv_uniform VXC_512Bits uniDescaleU8_4x4;
|
||||
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateGWise_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;
|
||||
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;
|
||||
_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;
|
||||
|
||||
_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;
|
||||
_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;
|
||||
_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;
|
||||
_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4;
|
||||
|
||||
_viv_uniform int bOrder;
|
||||
_viv_uniform int rOrder;
|
||||
_viv_uniform int zp;
|
||||
_viv_uniform float outputScale;
|
||||
|
||||
__kernel void pre_process_yuv420_scale_U8toU8(
|
||||
__read_only image2d_array_t y_img, __read_only image2d_array_t u_img,
|
||||
__read_only image2d_array_t v_img, __write_only image2d_array_t output,
|
||||
global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,
|
||||
float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)
|
||||
{
|
||||
int4 gidx = get_global_id(0);
|
||||
int gidy = get_global_id(1);
|
||||
gidx += (int4)(0, 1, 2, 3);
|
||||
|
||||
int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14);
|
||||
int4 sx = fx & 0xffff8000; // Floor
|
||||
int fy, sy;
|
||||
fx -= sx;
|
||||
sx = sx >> 15;
|
||||
fx = (fx +(1 << 4)) >> 5;
|
||||
|
||||
// for y
|
||||
fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14);
|
||||
sy = fy & 0xffff8000; // Floor
|
||||
fy -= sy;
|
||||
sy = sy >> 15;
|
||||
|
||||
sy = sy < 0 ? 0 : sy;
|
||||
fy = fy < 0 ? 0 : fy;
|
||||
|
||||
fy = (fy + (1<< 4)) >> 5;
|
||||
sx += (*xOffset);
|
||||
sy += (*yOffset);
|
||||
int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0);
|
||||
int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0);
|
||||
int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0);
|
||||
|
||||
vxc_uchar16 Y, U, V;
|
||||
vxc_int4 C0, C1, C2, C3;
|
||||
vxc_uchar16 R, G, B;
|
||||
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
|
||||
srcPos1.x = (sx.x + 1) >> 1;
|
||||
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
|
||||
srcPos2.x = (sx.x + 1) >> 1;
|
||||
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
srcPos.x = sx.y;
|
||||
srcPos1.x = sx.y >> 1;
|
||||
srcPos2.x = sx.y >> 1;
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
|
||||
srcPos1.x = (sx.y + 1) >> 1;
|
||||
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
|
||||
srcPos2.x = (sx.y + 1) >> 1;
|
||||
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
srcPos.x = sx.z;
|
||||
srcPos1.x = sx.z >> 1;
|
||||
srcPos2.x = sx.z >> 1;
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));
|
||||
srcPos1.x = (sx.z + 1) >> 1;
|
||||
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));
|
||||
srcPos2.x = (sx.z + 1) >> 1;
|
||||
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
srcPos.x = sx.w;
|
||||
srcPos1.x = sx.w >> 1;
|
||||
srcPos2.x = sx.w >> 1;
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));
|
||||
srcPos1.x = (sx.w + 1) >> 1;
|
||||
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));
|
||||
srcPos2.x = (sx.w + 1) >> 1;
|
||||
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
//C = Y - 16; D = U - 128; E = V - 128;
|
||||
// ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]
|
||||
int tmpV = -56992;
|
||||
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4);
|
||||
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4);
|
||||
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4);
|
||||
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4);
|
||||
VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
|
||||
VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
|
||||
VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
|
||||
VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
|
||||
|
||||
// ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]
|
||||
// 298Y - 208V
|
||||
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4);
|
||||
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4);
|
||||
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4);
|
||||
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4);
|
||||
// 34784 - 100U
|
||||
ushort tmpG = 34784;
|
||||
vxc_ushort8 tmpDstG, tmpDstG1;
|
||||
VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);
|
||||
VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8);
|
||||
VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);
|
||||
VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);
|
||||
VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);
|
||||
VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);
|
||||
|
||||
// ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]
|
||||
VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4);
|
||||
VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4);
|
||||
VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4);
|
||||
VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4);
|
||||
tmpV = -70688;
|
||||
VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
|
||||
VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
|
||||
VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
|
||||
VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
|
||||
|
||||
int4 result, temp1, temp2;
|
||||
int4 tmpData0, tmpData1;
|
||||
|
||||
VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);
|
||||
VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);
|
||||
temp1 = fx * tmpData0 + tmpData1;
|
||||
// temp2 - temp1
|
||||
VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);
|
||||
VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);
|
||||
temp2 = fx * tmpData0 + tmpData1;
|
||||
result = fy * temp2 + (temp1 << 10);
|
||||
|
||||
tmpV = 1 << 19;
|
||||
vxc_uchar8 dst;
|
||||
float4 tmpDst;
|
||||
int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);
|
||||
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);
|
||||
tmpDst = (tmpDst - bMean) * var;
|
||||
dstPos.z = bOrder;
|
||||
result = convert_int4_rte(tmpDst * outputScale + zp);
|
||||
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);
|
||||
VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);
|
||||
temp1 = fx * tmpData0 + tmpData1;
|
||||
VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);
|
||||
VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);
|
||||
temp2 = fx * tmpData0 + tmpData1;
|
||||
result = fy * temp2 + (temp1 << 10);
|
||||
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);
|
||||
tmpDst = (tmpDst - gMean) * var;
|
||||
dstPos.z = 1;
|
||||
result = convert_int4_rte(tmpDst * outputScale + zp);
|
||||
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);
|
||||
VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);
|
||||
temp1 = fx * tmpData0 + tmpData1;
|
||||
VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);
|
||||
VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);
|
||||
temp2 = fx * tmpData0 + tmpData1;
|
||||
result = fy * temp2 + (temp1 << 10);
|
||||
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);
|
||||
tmpDst = (tmpDst - rMean) * var;
|
||||
dstPos.z = rOrder;
|
||||
result = convert_int4_rte(tmpDst * outputScale + zp);
|
||||
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);
|
||||
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
|
||||
}

@ -0,0 +1,88 @@
#include "cl_viv_vx_ext.h"

_viv_uniform int bOrder;
_viv_uniform int rOrder;

_viv_uniform float outputScaleVar;
_viv_uniform float bMeanScaleVarZp;
_viv_uniform float gMeanScaleVarZp;
_viv_uniform float rMeanScaleVarZp;

_viv_uniform VXC_512Bits uniConvertYUV422toB_4x4;
_viv_uniform VXC_512Bits uniConvertYUV422toG_4x4;
_viv_uniform VXC_512Bits uniConvertYUV422toR_4x4;

_viv_uniform VXC_512Bits uniExtract8Data_2x8;
_viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8;

#define YUV422_COPY_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \
__kernel void pre_process_yuv422_copy_##name \
( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
global int* xRatio, \
global int* yRatio, \
global int* xOffset, \
global int* yOffset, \
float rMean, \
float gMean, \
float bMean, \
float var, \
int reverse_channel, \
int trans, \
int yuv422_type \
) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
\
int sy = gidy + (*yOffset); \
int sx = gidx + (*xOffset * 2); \
\
vxc_uchar8 YUV; \
vxc_short8 tmpYUV; \
\
VXC_ReadImage(YUV, input, (int2)(sx,sy), 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
if (yuv422_type == 1) \
{ \
YUV.s01234567 = YUV.s10325476; \
} \
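/* yuv422_type == 1 appears to select UYVY (U Y0 V Y1); the swap above reorders the bytes to YUYV (Y0 U Y1 V) before unpacking */ \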
\
short tmpVal = 128; \
VXC_DP2x8(tmpYUV, YUV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8); \
\
float4 tmpDstB, tmpDstG, tmpDstR; \
VXC_DP4x4(tmpDstB, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toB_4x4); \
VXC_DP4x4(tmpDstG, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toG_4x4); \
VXC_DP4x4(tmpDstR, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toR_4x4); \
\
conv_type result; \
dst_type dst0; \
save_type dst; \
int4 dstPos = (int4)(gidx, gidy, 0, 0); \
tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \
_viv_asm(CONV_RTE, result, tmpDstB); \
dstPos.z = bOrder; \
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \
_viv_asm(COPY, dst, dst0, copy_bytes); \
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
\
tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \
_viv_asm(CONV_RTE, result, tmpDstG); \
dstPos.z = 1; \
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \
_viv_asm(COPY, dst, dst0, copy_bytes); \
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
\
tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \
_viv_asm(CONV_RTE, result, tmpDstR); \
dstPos.z = rOrder; \
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \
_viv_asm(COPY, dst, dst0, copy_bytes); \
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
YUV422_COPY_SH_IMPL(U8toU8, vxc_uchar4, int4, vxc_uchar4, 4)
YUV422_COPY_SH_IMPL(U8toI8, vxc_char4, int4, vxc_char4, 4)
YUV422_COPY_SH_IMPL(U8toI16, vxc_short4, int4, vxc_short4, 8)
YUV422_COPY_SH_IMPL(U8toF16, vxc_half4, half4, vxc_short4, 8)

@ -0,0 +1,132 @@
#include "cl_viv_vx_ext.h"

_viv_uniform int bOrder;
_viv_uniform int rOrder;

_viv_uniform float outputScaleVar;
_viv_uniform float bMeanScaleVarZp;
_viv_uniform float gMeanScaleVarZp;
_viv_uniform float rMeanScaleVarZp;

_viv_uniform uint xrIntFloat_16;
_viv_uniform uint yrIntFloat_16;

_viv_uniform VXC_512Bits uniConvertYUV422toB_4x4;
_viv_uniform VXC_512Bits uniConvertYUV422toG_4x4;
_viv_uniform VXC_512Bits uniConvertYUV422toR_4x4;

_viv_uniform VXC_512Bits uniExtract8Data_2x8;
_viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8;

#define uyvy422 1

#define YUV422_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \
__kernel void pre_process_yuv422_scale_##name \
( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
global int* xRatio, \
global int* yRatio, \
global int* xOffset, \
global int* yOffset, \
float rMean, \
float gMean, \
float bMean, \
float var, \
int reverse_channel, \
int trans, \
int yuv422_type \
) \
{ \
int4 gidx = get_global_id(0); \
int gidy = get_global_id(1); \
gidx += (int4)(0, 1, 2, 3); \
\
uint dy = (convert_uint(gidy) * yrIntFloat_16) >> 16; \
uint4 dx = (convert_uint4(gidx) * xrIntFloat_16) >> 16; \
int sy = convert_int(dy) + (*yOffset); \
int4 sx = convert_int4(dx)+ (*xOffset * 2); \
\
vxc_uchar4 Y; \
vxc_uchar8 UV; \
vxc_char8 tmpUV; \
short tmpVal = 128; \
int y_offset = 0; \
int u_offset = 1; \
int v_offset = 3; \
\
if (yuv422_type == uyvy422) \
{ \
y_offset = 1; \
u_offset = 0; \
v_offset = 2; \
} \
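/* packed 4:2:2: two pixels share one U and one V, stored as Y0 U Y1 V (YUYV) or U Y0 V Y1 (UYVY), hence the per-channel byte offsets above */ \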
\
int4 coord_Y = (int4)(sx.x * 2 + y_offset, sy, 0, 0); \
int4 coord_U = (int4)((sx.x >> 1) * 4 + u_offset, sy, 0, 0); \
int4 coord_V = (int4)((sx.x >> 1) * 4 + v_offset, sy, 0, 0); \
\
VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
coord_Y.x = sx.y * 2 + y_offset; \
VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \
coord_Y.x = sx.z * 2 + y_offset; \
VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \
coord_Y.x = sx.w * 2 + y_offset; \
VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \
\
sx = (sx >> 1) * 4 + u_offset; \
VXC_ReadImage2DArray(UV, input, coord_U, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
coord_U.x = sx.y; \
VXC_ReadImage2DArray(UV, input, coord_U, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \
coord_U.x = sx.z; \
VXC_ReadImage2DArray(UV, input, coord_U, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \
coord_U.x = sx.w; \
VXC_ReadImage2DArray(UV, input, coord_U, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \
\
sx = sx - u_offset + v_offset; \
VXC_ReadImage2DArray(UV, input, coord_V, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); \
coord_V.x = sx.y; \
VXC_ReadImage2DArray(UV, input, coord_V, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); \
coord_V.x = sx.z; \
VXC_ReadImage2DArray(UV, input, coord_V, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); \
coord_V.x = sx.w; \
VXC_ReadImage2DArray(UV, input, coord_V, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8); \
vxc_uchar4 dst_test; \
VXC_DP2x8(dst_test, dx, dx, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \
\
float4 tmpDstB, tmpDstG, tmpDstR; \
VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toB_4x4); \
VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toG_4x4); \
VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toR_4x4); \
\
conv_type result; \
dst_type dst0; \
save_type dst; \
int4 dstPos = (int4)(gidx.x, gidy, 0, 0); \
tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \
_viv_asm(CONV_RTE, result, tmpDstB); \
dstPos.z = bOrder; \
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \
_viv_asm(COPY, dst, dst0, copy_bytes); \
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
\
tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \
_viv_asm(CONV_RTE, result, tmpDstG); \
dstPos.z = 1; \
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \
_viv_asm(COPY, dst, dst0, copy_bytes); \
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
\
tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \
_viv_asm(CONV_RTE, result, tmpDstR); \
dstPos.z = rOrder; \
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \
_viv_asm(COPY, dst, dst0, copy_bytes); \
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}

YUV422_SH_IMPL(U8toU8, vxc_uchar4, int4, vxc_uchar4, 4)
YUV422_SH_IMPL(U8toI8, vxc_char4, int4, vxc_char4, 4)
YUV422_SH_IMPL(U8toI16, vxc_short4, int4, vxc_short4, 8)
YUV422_SH_IMPL(U8toF16, vxc_half4, half4, vxc_short4, 8)
|
||||
|
|
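Each YUV422_SH_IMPL line above instantiates one kernel per output data type. As an informal sketch of the expansion (derived only from the macro as written; this is not an extra kernel added by the commit), the U8toU8 instantiation becomes roughly:

__kernel void pre_process_yuv422_scale_U8toU8
(
__read_only image2d_array_t input, __write_only image2d_array_t output,
global int* xRatio, global int* yRatio, global int* xOffset, global int* yOffset,
float rMean, float gMean, float bMean, float var,
int reverse_channel, int trans, int yuv422_type
)
{ /* macro body with dst_type = vxc_uchar4, conv_type = int4, save_type = vxc_uchar4, copy_bytes = 4 */ }

so the four instantiations cover U8 input converted to U8, I8, I16 and F16 outputs, and the yuv422_type argument selects YUYV (default Y/U/V byte offsets) or UYVY (uyvy422) ordering.
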
@ -12,15 +12,15 @@ _viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp
vxc_ushort8 mp0, mp1; \
_viv_asm(COPY, mp0, multAndoutZP0, 16); \
_viv_asm(COPY, mp1, multAndoutZP1, 16); \
read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \
read_fun(src0, input0, coord, 0, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \
read_fun(src1, input1, coord, 0, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
uniU8MulAndPostShift0_Lo_2x8); \
VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
uniU8MulAndPostShift1_Lo_2x8); \
read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \
read_fun(value_tmp, condition, coord, 0, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_DP2x8(value, value_tmp, value_tmp,\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvConditiontoDst_2x8); \

@ -60,11 +60,11 @@ SELECT_INT_FUN_2D(I8, I16, I16, vxc_short8)

#define SELECT_HALF(read_fun, write_fun) \
vxc_short8 src0, src1, dst, value; \
vxc_char8 value_tmp; \
read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \
read_fun(src0, input0, coord, 0, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \
read_fun(src1, input1, coord, 0, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \
read_fun(value_tmp, condition, coord, 0, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_DP2x8(value, value_tmp, value_tmp,\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvConditiontoDst_2x8); \

@ -91,37 +91,36 @@ __kernel void select_I8_F16_F16toF16_2D(
SELECT_HALF(VXC_ReadImage, VXC_WriteImage)
}

#define SELECT_HYBRIDTOF16(src0_type, copy0_type, src1_type, copy1_type, read_fun, write_fun) \
vxc_short8 src0, src1, dst, value; \
vxc_half8 value0, value1; \
src0_type r0; \
src1_type r1; \
#define SELECT_HYBRID(src0_type, copy0_type, src1_type, copy1_type, dst_type, save_type, read_fun, write_fun) \
save_type dst, value; \
save_type dst0, dst1; \
dst_type value0, value1; \
src0_type src0; \
src1_type src1; \
copy0_type v0; \
copy1_type v1; \
vxc_char8 value_tmp; \
vxc_ushort8 mp0, mp1; \
_viv_asm(COPY, mp0, multAndoutZP0, 16); \
_viv_asm(COPY, mp1, multAndoutZP1, 16); \
read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
read_fun(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, v0, src0, 16); \
read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
read_fun(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, v1, src1, 16); \
VXC_DP2x8(value0, v0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
uniU8MulAndPostShift0_Lo_2x8); \
_viv_asm(COPY, src0, value0, 16); \
_viv_asm(COPY, dst0, value0, 16); \
VXC_DP2x8(value1, v1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
uniU8MulAndPostShift1_Lo_2x8); \
_viv_asm(COPY, src1, value1, 16); \
read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \
_viv_asm(COPY, dst1, value1, 16); \
read_fun(value_tmp, condition, coord, 0, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_DP2x8(value, value_tmp, value_tmp,\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvConditiontoDst_2x8); \
dst = (value != 0 ? src0 : src1); \
dst = (value != 0 ? dst0 : dst1); \
write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

#define SELECT_HYBRID_TOF16_FUN(name, src0_type, copy0_type, src1_type, copy1_type) \
#define SELECT_HYBRID_FUN(name, src0_type, copy0_type, src1_type, copy1_type, dst_type, save_type) \
__kernel void select_##name( \
__read_only image2d_array_t condition, \
__read_only image2d_array_t input0, \

@ -129,44 +128,62 @@ __kernel void select_##name( \
__write_only image2d_array_t output) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
SELECT_HYBRIDTOF16(src0_type, copy0_type, src1_type, copy1_type, \
SELECT_HYBRID(src0_type, copy0_type, src1_type, copy1_type, dst_type, save_type,\
VXC_ReadImage2DArray, VXC_WriteImage2DArray) \
}
SELECT_HYBRID_TOF16_FUN(I8_F16_U8toF16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16)
SELECT_HYBRID_TOF16_FUN(I8_U8_F16toF16, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8)
SELECT_HYBRID_TOF16_FUN(I8_F16_I8toF16, vxc_short8, vxc_half8, vxc_char16, vxc_char16)
SELECT_HYBRID_TOF16_FUN(I8_I8_F16toF16, vxc_char16, vxc_char16, vxc_short8, vxc_half8)
SELECT_HYBRID_TOF16_FUN(I8_F16_I16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)
SELECT_HYBRID_TOF16_FUN(I8_I16_F16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_half8)
SELECT_HYBRID_FUN(I8_F16_U8toF16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_half8, vxc_short8)
SELECT_HYBRID_FUN(I8_U8_F16toF16, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_half8, vxc_short8)
SELECT_HYBRID_FUN(I8_F16_I8toF16, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_half8, vxc_short8)
SELECT_HYBRID_FUN(I8_I8_F16toF16, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_half8, vxc_short8)
SELECT_HYBRID_FUN(I8_F16_I16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_half8, vxc_short8)
SELECT_HYBRID_FUN(I8_I16_F16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_half8, vxc_short8)
SELECT_HYBRID_FUN(I8_F16_U8toU8, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_uchar8, vxc_uchar8)
SELECT_HYBRID_FUN(I8_U8_F16toU8, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8)
SELECT_HYBRID_FUN(I8_F16_I8toI8, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_char8, vxc_char8)
SELECT_HYBRID_FUN(I8_I8_F16toI8, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_char8, vxc_char8)
SELECT_HYBRID_FUN(I8_F16_I16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_short8, vxc_short8)
SELECT_HYBRID_FUN(I8_I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, vxc_short8)
SELECT_HYBRID_FUN(I8_U8_U8toF16, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_half8, vxc_short8)
SELECT_HYBRID_FUN(I8_I8_I8toF16, vxc_char8, vxc_char8, vxc_char8, vxc_char8, vxc_half8, vxc_short8)
SELECT_HYBRID_FUN(I8_I16_I16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8)

#define SELECT_HYBRID_TOF16_FUN_2D(name, src0_type, copy0_type, src1_type, copy1_type) \
__kernel void select_##name( \
#define SELECT_HYBRID_FUN_2D(name, src0_type, copy0_type, src1_type, copy1_type, dst_type, save_type) \
__kernel void select_##name##_2D( \
__read_only image2d_array_t condition, \
__read_only image2d_array_t input0, \
__read_only image2d_array_t input1, \
__write_only image2d_array_t output) \
{ \
int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
SELECT_HYBRIDTOF16(src0_type, copy0_type, src1_type, copy1_type, \
SELECT_HYBRID(src0_type, copy0_type, src1_type, copy1_type, dst_type, save_type, \
VXC_ReadImage, VXC_WriteImage) \
}
SELECT_HYBRID_TOF16_FUN_2D(I8_F16_U8toF16_2D, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16)
SELECT_HYBRID_TOF16_FUN_2D(I8_U8_F16toF16_2D, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8)
SELECT_HYBRID_TOF16_FUN_2D(I8_F16_I8toF16_2D, vxc_short8, vxc_half8, vxc_char16, vxc_char16)
SELECT_HYBRID_TOF16_FUN_2D(I8_I8_F16toF16_2D, vxc_char16, vxc_char16, vxc_short8, vxc_half8)
SELECT_HYBRID_TOF16_FUN_2D(I8_F16_I16toF16_2D, vxc_short8, vxc_half8, vxc_short8, vxc_short8)
SELECT_HYBRID_TOF16_FUN_2D(I8_I16_F16toF16_2D, vxc_short8, vxc_short8, vxc_short8, vxc_half8)
SELECT_HYBRID_FUN_2D(I8_F16_U8toF16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_half8, vxc_short8)
SELECT_HYBRID_FUN_2D(I8_U8_F16toF16, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_half8, vxc_short8)
SELECT_HYBRID_FUN_2D(I8_F16_I8toF16, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_half8, vxc_short8)
SELECT_HYBRID_FUN_2D(I8_I8_F16toF16, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_half8, vxc_short8)
SELECT_HYBRID_FUN_2D(I8_F16_I16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_half8, vxc_short8)
SELECT_HYBRID_FUN_2D(I8_I16_F16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_half8, vxc_short8)
SELECT_HYBRID_FUN_2D(I8_F16_U8toU8, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_uchar8, vxc_uchar8)
SELECT_HYBRID_FUN_2D(I8_U8_F16toU8, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8)
SELECT_HYBRID_FUN_2D(I8_F16_I8toI8, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_char8, vxc_char8)
SELECT_HYBRID_FUN_2D(I8_I8_F16toI8, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_char8, vxc_char8)
SELECT_HYBRID_FUN_2D(I8_F16_I16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_short8, vxc_short8)
SELECT_HYBRID_FUN_2D(I8_I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, vxc_short8)
SELECT_HYBRID_FUN_2D(I8_U8_U8toF16, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_half8, vxc_short8)
SELECT_HYBRID_FUN_2D(I8_I8_I8toF16, vxc_char8, vxc_char8, vxc_char8, vxc_char8, vxc_half8, vxc_short8)
SELECT_HYBRID_FUN_2D(I8_I16_I16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8)

#define SELECT_HALF_TO_QINT(read_fun, write_fun, dst_type) \
vxc_short8 src0, src1, tmp_dst, value; \
vxc_half8 data; \
dst_type dst; \
vxc_char8 value_tmp; \
read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \
read_fun(src0, input0, coord, 0, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \
read_fun(src1, input1, coord, 0, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \
read_fun(value_tmp, condition, coord, 0, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_DP2x8(value, value_tmp, value_tmp,\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvConditiontoDst_2x8); \

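As an informal sketch of the renamed select macros (derived from the definitions above, not extra code in the commit), one instantiation pair expands roughly to:

/* SELECT_HYBRID_FUN(I8_F16_U8toU8, ...)    -> __kernel void select_I8_F16_U8toU8(...)     */
/* SELECT_HYBRID_FUN_2D(I8_F16_U8toU8, ...) -> __kernel void select_I8_F16_U8toU8_2D(...)  */
/* because the 2D wrapper now appends the suffix itself: select_##name##_2D.               */
/* Inside SELECT_HYBRID the condition is widened via uniConvConditiontoDst_2x8 and the     */
/* output is chosen per element with dst = (value != 0 ? dst0 : dst1); the new dst_type    */
/* and save_type parameters only decide how the two rescaled operands are held for that    */
/* select, which is what lets the same macro emit U8/I8/I16 outputs as well as F16.        */
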
File diff suppressed because it is too large
@ -478,7 +478,7 @@ vsi_status vsi_nn_ClientNodePassParameters
)
{
vsi_status status;
uint8_t i;
uint32_t i;

status = VSI_FAILURE;
for( i = 0; i < num; i++ )

@ -1,8 +1,207 @@
# to make ovxlib compile with both the IDE and the SDK
# if you want to compile with the IDE: export USE_IDE_LIB=1
# and VIVANTE_SDK_DIR=..../VeriSilicon/VivanteIDE5.4.0/cmdtools/vsimulator

###################################################################################
# common parts
# OBJECTS.

OBJECTS = $(OBJ_DIR)/vsi_nn_context.o \
$(OBJ_DIR)/vsi_nn_client_op.o \
$(OBJ_DIR)/vsi_nn_graph.o \
$(OBJ_DIR)/vsi_nn_node_attr_template.o \
$(OBJ_DIR)/vsi_nn_node.o \
$(OBJ_DIR)/vsi_nn_ops.o \
$(OBJ_DIR)/vsi_nn_daemon.o \
$(OBJ_DIR)/vsi_nn_tensor.o \
$(OBJ_DIR)/vsi_nn_version.o \
$(OBJ_DIR)/vsi_nn_rnn.o \
$(OBJ_DIR)/vsi_nn_rnn_helper.o \
$(OBJ_DIR)/vsi_nn_internal_node.o \
$(OBJ_DIR)/vsi_nn_log.o \
$(OBJ_DIR)/vsi_nn_graph_optimization.o \
$(OBJ_DIR)/vsi_nn_pre_post_process.o

vpath %.c utils
OBJECTS += $(OBJ_DIR)/vsi_nn_code_generator.o \
$(OBJ_DIR)/vsi_nn_binary_tree.o \
$(OBJ_DIR)/vsi_nn_map.o \
$(OBJ_DIR)/vsi_nn_link_list.o \
$(OBJ_DIR)/vsi_nn_math.o \
$(OBJ_DIR)/vsi_nn_dtype_util.o \
$(OBJ_DIR)/vsi_nn_shape_util.o \
$(OBJ_DIR)/vsi_nn_dtype.o \
$(OBJ_DIR)/vsi_nn_limits.o \
$(OBJ_DIR)/vsi_nn_vdata.o \
$(OBJ_DIR)/vsi_nn_util.o \
$(OBJ_DIR)/vsi_nn_dlfcn.o \
$(OBJ_DIR)/vsi_nn_constraint_check.o \
$(OBJ_DIR)/vsi_nn_hashmap.o \
$(OBJ_DIR)/vsi_nn_tensor_op.o

vpath %.c quantization
OBJECTS += $(OBJ_DIR)/vsi_nn_dynamic_fixed_point.o \
$(OBJ_DIR)/vsi_nn_asymmetric_affine.o \
$(OBJ_DIR)/vsi_nn_perchannel_symmetric_affine.o

vpath %.c pycc
OBJECTS += $(OBJ_DIR)/vsi_pycc_interface.o

vpath %.c post
OBJECTS += $(OBJ_DIR)/vsi_nn_post_fasterrcnn.o \
$(OBJ_DIR)/vsi_nn_post_cmupose.o

vpath %.c libnnext
OBJECTS += $(OBJ_DIR)/vsi_nn_libnnext_resource.o \
$(OBJ_DIR)/vsi_nn_vxkernel.o

vpath %.c cpu_backend
SRCS += ${notdir ${wildcard cpu_backend/*.c}}

vpath %.c libnnext/ops/kernel
SRCS += ${notdir ${wildcard libnnext/ops/kernel/*.c}}

vpath %.c ops
SRCS += ${notdir ${wildcard ops/*.c}}

vpath %.c kernel
SRCS += ${notdir ${wildcard kernel/*.c}}

vpath %.c kernel/cl
SRCS += ${notdir ${wildcard kernel/cl/*.c}}

vpath %.c kernel/cpu
SRCS += ${notdir ${wildcard kernel/cpu/*.c}}

vpath %.c kernel/evis
SRCS += ${notdir ${wildcard kernel/evis/*.c}}

vpath %.c kernel/vx
SRCS += ${notdir ${wildcard kernel/vx/*.c}}

vpath %.c kernel/sp
SRCS += ${notdir ${wildcard kernel/sp/*.c}}

vpath %.c custom/ops
SRCS += ${notdir ${wildcard custom/ops/*.c}}

vpath %.c custom/ops/kernel/evis
SRCS += ${notdir ${wildcard custom/ops/kernel/evis/*.c}}

vpath %.c custom/ops/kernel/cl
SRCS += ${notdir ${wildcard custom/ops/kernel/cl/*.c}}

vpath %.c custom/ops/kernel/cpu
SRCS += ${notdir ${wildcard custom/ops/kernel/cpu/*.c}}

vpath %.c custom/ops/kernel/sp
SRCS += ${notdir ${wildcard custom/ops/kernel/sp/*.c}}

OBJECTS += ${patsubst %.c, $(OBJ_DIR)/%.o, $(SRCS)}

ifeq ($(USE_VIP_DEVICE),1)
vpath %.cpp vip
OBJECTS += $(OBJ_DIR)/virtual_device.o
endif

################################################################################
ifeq ($(USE_IDE_LIB),1)
# IDE.

CC=$(CROSS_COMPILE)gcc

INCLUDES=-I. -I$(VIVANTE_SDK_DIR)/include/ \
-I$(VIVANTE_SDK_DIR)/include/CL \
-I$(VIVANTE_SDK_DIR)/include/VX \
-I../include/ops -I../include/utils -I../include/inference \
-I../include/client -I../include -I../include/libnnext \
-I../include/cpu_backend

ifeq (1,$(DEBUG))
CFLAGS+=-g
LFLAGS+=-g
else
CFLAGS+=-O3
LFLAGS+=-O3
endif
CFLAGS += $(INCLUDES)
CFLAGS += -Wall -Wextra -Wno-unused-parameter -Wno-sign-compare -Werror -Wno-strict-aliasing -Wno-maybe-uninitialized
CFLAGS += -fvisibility=hidden -D'OVXLIB_API=__attribute__((visibility("default")))'

LIBS+= -L$(VIVANTE_SDK_DIR)/lib \
-lOpenVX -lOpenVXU -lCLC -lVSC -lGAL -lEmulator -lvdtproxy -lArchModelSw -lNNArchPerf
LIBS+= -L$(VIVANTE_SDK_DIR)/lib/vsim \
-lOpenVX -lOpenVXU -lCLC -lVSC -lGAL -lEmulator -lvdtproxy
LIBS+= -L$(VIVANTE_SDK_DIR)/lib/x64_linux \
-lOpenVX -lOpenVXU -lCLC -lVSC -lGAL -lEmulator -lvdtproxy
LIBS+= -L$(VIVANTE_SDK_DIR)/lib/x64_linux/vsim \
-lOpenVX -lOpenVXU -lCLC -lVSC -lGAL -lEmulator -lvdtproxy
LIBS+= -L$(VIVANTE_SDK_DIR)/lib/x64_linux/vsim \
-lOpenVX -lOpenVXU -lCLC -lVSC -lGAL -lEmulator -lvdtproxy
LIBS+= -L$(VIVANTE_SDK_DIR)/../common/lib/ \
-lvdtproxy
LIBS += -lm -ldl

File = $(VIVANTE_SDK_DIR)/lib/libjpeg.a
File2 = $(VIVANTE_SDK_DIR)/lib/x64_linux/libjpeg.a
File3 = $(VIVANTE_SDK_DIR)/../common/lib/libjpeg.a
ifeq ($(File),$(wildcard $(File)))
LIBS+= $(File)
else ifeq ($(File2),$(wildcard $(File2)))
LIBS+= $(File2)
else
LIBS+= $(File3)
endif

###################################################################################
# Macros.
CFLAGS += -fPIC
DYNAMIC := 1
TARGET_NAME = libovxlib.so
OBJ_DIR=bin_r
TARGET_OUTPUT = $(OBJ_DIR)/$(TARGET_NAME)

all: $(TARGET_OUTPUT)
clean:
@rm -rf $(OBJ_DIR)/* $(OBJ_DIR)

install: $(TARGET_OUTPUT)

################################################################################

LDFLAGS += -Wall -shared -Wl,-soname,$(TARGET_NAME) -Wl,-z,defs -fPIC

ifeq ($(USE_VIP_DEVICE),1)
LDFLAGS += -pthread
LIBS += -lstdc++
INCLUDE += -I../include/vip
$(OBJ_DIR)/virtual_device.o: virtual_device.cpp
@echo " COMPILE $(abspath $<)"
@mkdir -p $(OBJ_DIR)
@$(CXX) -c -std=c++14 -pthread $(CFLAGS) -o $@ $<
endif

$(TARGET_OUTPUT): $(OBJECTS)
@echo " LINK \033[1m$(notdir $@)\033[0m"
@$(CC) $(LDFLAGS) $(OBJECTS) -o $(TARGET_OUTPUT) $(LIBS)

$(OBJ_DIR)/%.o: %.c
@echo " COMPILE $(abspath $<)"
@mkdir -p $(OBJ_DIR)
@$(CC) -c $(CFLAGS) -o $@ $<

else
##################################################################################
# SDK.

# include common definition.
include $(AQROOT)/makefile.linux.def

#################################################################################
INCLUDE += -I$(VIVANTE_SDK_INC) -I$(VIVANTE_SDK_INC)/HAL -I$(AQROOT)/sdk/inc
INCLUDE += -I../include/ops -I../include/utils -I../include/inference
INCLUDE += -I../include/client -I../include -I../include/libnnext
INCLUDE += -I../include/cpu_backend

CFLAGS += $(INCLUDE)
CFLAGS += -Wall -Wextra -Wno-unused-parameter -Wno-sign-compare -Werror

@ -43,89 +242,6 @@ ifneq ($(gcdSTATIC_LINK), 1)
endif
endif
#############################################################################
# Objects.
OBJECTS = $(OBJ_DIR)/vsi_nn_context.o \
$(OBJ_DIR)/vsi_nn_client_op.o \
$(OBJ_DIR)/vsi_nn_graph.o \
$(OBJ_DIR)/vsi_nn_node_attr_template.o \
$(OBJ_DIR)/vsi_nn_node.o \
$(OBJ_DIR)/vsi_nn_ops.o \
$(OBJ_DIR)/vsi_nn_daemon.o \
$(OBJ_DIR)/vsi_nn_tensor.o \
$(OBJ_DIR)/vsi_nn_version.o \
$(OBJ_DIR)/vsi_nn_rnn.o \
$(OBJ_DIR)/vsi_nn_rnn_helper.o \
$(OBJ_DIR)/vsi_nn_internal_node.o \
$(OBJ_DIR)/vsi_nn_log.o \
$(OBJ_DIR)/vsi_nn_graph_optimization.o \
$(OBJ_DIR)/vsi_nn_pre_post_process.o

vpath %.c utils
OBJECTS += $(OBJ_DIR)/vsi_nn_code_generator.o \
$(OBJ_DIR)/vsi_nn_binary_tree.o \
$(OBJ_DIR)/vsi_nn_map.o \
$(OBJ_DIR)/vsi_nn_link_list.o \
$(OBJ_DIR)/vsi_nn_math.o \
$(OBJ_DIR)/vsi_nn_dtype_util.o \
$(OBJ_DIR)/vsi_nn_shape_util.o \
$(OBJ_DIR)/vsi_nn_dtype.o \
$(OBJ_DIR)/vsi_nn_limits.o \
$(OBJ_DIR)/vsi_nn_vdata.o \
$(OBJ_DIR)/vsi_nn_util.o \
$(OBJ_DIR)/vsi_nn_constraint_check.o \
$(OBJ_DIR)/vsi_nn_hashmap.o \
$(OBJ_DIR)/vsi_nn_tensor_op.o

vpath %.c quantization
OBJECTS += $(OBJ_DIR)/vsi_nn_dynamic_fixed_point.o \
$(OBJ_DIR)/vsi_nn_asymmetric_affine.o \
$(OBJ_DIR)/vsi_nn_perchannel_symmetric_affine.o

vpath %.c pycc
OBJECTS += $(OBJ_DIR)/vsi_pycc_interface.o

vpath %.c post
OBJECTS += $(OBJ_DIR)/vsi_nn_post_fasterrcnn.o \
$(OBJ_DIR)/vsi_nn_post_cmupose.o

vpath %.c libnnext
OBJECTS += $(OBJ_DIR)/vsi_nn_libnnext_resource.o \
$(OBJ_DIR)/vsi_nn_vxkernel.o

vpath %.c libnnext/ops/kernel
SRCS += ${notdir ${wildcard libnnext/ops/kernel/*.c}}

vpath %.c ops
SRCS += ${notdir ${wildcard ops/*.c}}

vpath %.c kernel
SRCS += ${notdir ${wildcard kernel/*.c}}

vpath %.c kernel/cl
SRCS += ${notdir ${wildcard kernel/cl/*.c}}

vpath %.c kernel/cpu
SRCS += ${notdir ${wildcard kernel/cpu/*.c}}

vpath %.c kernel/evis
SRCS += ${notdir ${wildcard kernel/evis/*.c}}

vpath %.c kernel/vx
SRCS += ${notdir ${wildcard kernel/vx/*.c}}

vpath %.c custom/ops
SRCS += ${notdir ${wildcard custom/ops/*.c}}

vpath %.c custom/ops/kernel/evis
SRCS += ${notdir ${wildcard custom/ops/kernel/evis/*.c}}

vpath %.c custom/ops/kernel/cl
SRCS += ${notdir ${wildcard custom/ops/kernel/cl/*.c}}

vpath %.c custom/ops/kernel/cpu
SRCS += ${notdir ${wildcard custom/ops/kernel/cpu/*.c}}

OBJECTS += ${patsubst %.c, $(OBJ_DIR)/%.o, $(SRCS)}

# installation directory
INSTALL_DIR := $(VIVANTE_SDK_LIB)

@ -133,4 +249,15 @@ INSTALL_DIR := $(VIVANTE_SDK_LIB)
################################################################################
# Include the common makefile.

ifeq ($(USE_VIP_DEVICE),1)
LDFLAGS += -pthread
LIBS += -lstdc++
INCLUDE += -I../include/vip
$(OBJ_DIR)/virtual_device.o: virtual_device.cpp
@echo " COMPILE $(abspath $<)"
@mkdir -p $(OBJ_DIR)
@$(CXX) -c -std=c++14 -pthread $(CFLAGS) -o $@ $<
endif

include $(AQROOT)/common.target
endif

@ -34,7 +34,7 @@
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_math.h"
#include "vsi_nn_test.h"
#include "vsi_nn_error.h"
#include "utils/vsi_nn_constraint_check.h"

static vsi_status op_compute

@ -48,9 +48,39 @@ static vsi_status op_compute
vx_nn_reorg_params_ext_t param;
vsi_nn_tensor_t *block_size_tensor = NULL;
vsi_nn_tensor_t *pad_tensor = NULL;
vsi_nn_tensor_t *input_tensor = NULL;
vsi_nn_tensor_t *output_tensor = NULL;
vsi_nn_tensor_attr_t attr;
memset(&param, 0, sizeof(vx_nn_reorg_params_ext_t));
int32_t block_size[2] = {1, 1};
vsi_bool need_release_tensor = TRUE;

block_size[0] = self->nn_param.batch2space.block_size[0];
if (vsi_nn_is_3d_tensor(inputs[0]))
{
vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = {{1}};
memcpy(shape[0], inputs[0]->attr.size, sizeof(shape[0]));
memcpy(shape[1], outputs[0]->attr.size, sizeof(shape[1]));
shape[0][3] = shape[0][2];
shape[0][2] = shape[0][1];
shape[0][1] = 1;
shape[1][3] = shape[1][2];
shape[1][2] = shape[1][1];
shape[1][1] = 1;

input_tensor = vsi_nn_reshape_tensor(self->graph, inputs[0], shape[0], 4);
CHECK_PTR_FAIL_GOTO( input_tensor, "create tensor fail.", final );
output_tensor = vsi_nn_reshape_tensor(self->graph, outputs[0], shape[1], 4);
CHECK_PTR_FAIL_GOTO( output_tensor, "create tensor fail.", final );
}
else
{
block_size[1] = self->nn_param.batch2space.block_size[1];
need_release_tensor = FALSE;
input_tensor = inputs[0];
output_tensor = outputs[0];
}

memset(&param, 0, sizeof(vx_nn_reorg_params_ext_t));
memset(&attr, 0, sizeof(attr));
attr.size[0] = 2;
attr.dim_num = 1;

@ -59,9 +89,9 @@ static vsi_status op_compute
attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
block_size_tensor = vsi_nn_CreateTensorFromData(
self->graph,
(uint8_t *)self->nn_param.batch2space.block_size,
(uint8_t *)block_size,
&attr);
TEST_CHECK_PTR(block_size_tensor, final);
CHECK_PTR_FAIL_GOTO( block_size_tensor, "create tensor fail.", final );

memset(&attr, 0, sizeof(attr));
attr.size[0] = 4;

@ -73,16 +103,16 @@ static vsi_status op_compute
self->graph,
(uint8_t *)self->nn_param.batch2space.crop,
&attr);
TEST_CHECK_PTR(pad_tensor, final);
CHECK_PTR_FAIL_GOTO( pad_tensor, "create tensor fail.", final );

param.base.block_size = REQUIRED_IO(block_size_tensor);
param.pad = OPTIONAL_IO(pad_tensor);
param.base.type = VX_REORG_BATCH_TO_SPACE_ND;
self->n = vxReorgLayer2( self->graph->g,
inputs[0]->t,
input_tensor->t,
(vx_nn_reorg_params_t *)&param,
sizeof(vx_nn_reorg_params_ext_t),
outputs[0]->t);
output_tensor->t);

if( NULL != self->n )
{

@ -90,8 +120,13 @@ static vsi_status op_compute
}

final:
if (block_size_tensor) vsi_nn_ReleaseTensor(&block_size_tensor);
if (pad_tensor) vsi_nn_ReleaseTensor(&pad_tensor);
if (need_release_tensor)
{
vsi_safe_release_tensor(input_tensor);
vsi_safe_release_tensor(output_tensor);
}
vsi_safe_release_tensor(block_size_tensor);
vsi_safe_release_tensor(pad_tensor);

return status;
} /* op_compute() */

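A worked illustration of the 3D branch in op_compute above (the sizes are hypothetical, not taken from the commit):

/* inputs[0]->attr.size = {16, 8, 4}            (3D tensor, ovxlib W, H, C order)        */
/* shape[0][3] = 4, shape[0][2] = 8, shape[0][1] = 1, shape[0][0] = 16                   */
/* -> input_tensor is a 4D view {16, 1, 8, 4}; outputs[0] is reshaped the same way,      */
/*    only block_size[0] is used and block_size[1] keeps its default value of 1,         */
/*    so vxReorgLayer2 always receives 4D tensors.                                       */
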
@ -105,14 +140,13 @@ static vsi_bool op_check
{
vsi_bool ret = FALSE;

if (inputs[0]->attr.dim_num != 4)
if (inputs[0]->attr.dim_num < 3)
{
VSILOGE("batch2space only support 4D");
VSILOGE("The input tensor shape must be 3D or 4D!");
return FALSE;
}

if (self->nn_param.batch2space.block_size[0] < 0
|| self->nn_param.batch2space.block_size[1] < 0)
if (self->nn_param.batch2space.block_size[0] < 0)
{
VSILOGE("Block size can't be less than zero in batch to space");
return FALSE;

@ -131,18 +165,33 @@ static vsi_bool op_setup
)
{
vsi_nn_batch2space_param * p;

p = (vsi_nn_batch2space_param *)&(self->nn_param.batch2space);

if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
outputs[0]->attr.size[3] =
inputs[0]->attr.size[3] / p->block_size[0] / p->block_size[1];
outputs[0]->attr.size[2] = inputs[0]->attr.size[2];
outputs[0]->attr.size[1] =
inputs[0]->attr.size[1] * p->block_size[1] - p->crop[2] - p->crop[3];
outputs[0]->attr.size[0] =
inputs[0]->attr.size[0] * p->block_size[0] - p->crop[0] - p->crop[1];
outputs[0]->attr.dim_num = 4;
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;

if (vsi_nn_is_3d_tensor(inputs[0]))
{
outputs[0]->attr.size[2] =
inputs[0]->attr.size[2] / p->block_size[0];
outputs[0]->attr.size[1] = inputs[0]->attr.size[1];
outputs[0]->attr.size[0] =
inputs[0]->attr.size[0] * p->block_size[0] - p->crop[0] - p->crop[1];
}
else
{
outputs[0]->attr.size[3] =
inputs[0]->attr.size[3] / p->block_size[0] / p->block_size[1];
outputs[0]->attr.size[2] = inputs[0]->attr.size[2];
outputs[0]->attr.size[1] =
inputs[0]->attr.size[1] * p->block_size[1] - p->crop[2] - p->crop[3];
outputs[0]->attr.size[0] =
inputs[0]->attr.size[0] * p->block_size[0] - p->crop[0] - p->crop[1];
}

}

return TRUE;

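A worked example for the shape inference in op_setup above (hypothetical numbers, not from the commit):

/* 3D branch: input size {8, 6, 4}, block_size[0] = 2, crop = {0, 0, ...}:               */
/*   size[0] = 8 * 2 - 0 - 0 = 16,  size[1] = 6,  size[2] = 4 / 2 = 2  ->  {16, 6, 2}    */
/* The 4D branch additionally scales size[1] by block_size[1] (minus crop[2]/crop[3])    */
/* and divides size[3] by block_size[0] * block_size[1].                                 */
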
Some files were not shown because too many files have changed in this diff