Update internal ovxlib to rel/1.2.2 (#674)

Update to SHA:806fcd6a69d333e62508acf0a6aa2c38c8385eae

Type: Code Improvement

Signed-off-by: Feiyue Chen <Feiyue.Chen@verisilicon.com>
Chen Feiyue 2024-01-03 13:13:15 +08:00 committed by GitHub
parent cf099e3849
commit 2d9e614a06
203 changed files with 18939 additions and 5096 deletions

View File

@ -3,6 +3,9 @@
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
# Some header file
include/vsi_nn_feature_config.h
# User-specific files
*.suo
*.user

View File

@ -195,3 +195,5 @@ DEF_OP(GRID_SAMPLE)
DEF_OP(LPNORM)
DEF_OP(RESIZE_3D)
DEF_OP(REDUCEL2)
DEF_OP(CROP_AND_RESIZE)
DEF_OP(TAN)

View File

@ -55,6 +55,7 @@ typedef int32_t vsi_nn_kernel_lut_act_e; enum
VSI_NN_KERNEL_LUT_ATANH = 21,
VSI_NN_KERNEL_LUT_ACOSH = 22,
VSI_NN_KERNEL_LUT_INVERSE_SIGMOID = 23,
VSI_NN_KERNEL_LUT_TAN = 24,
};

View File

@ -106,10 +106,21 @@ enum
BI_LSTM_BW_INPUT_LAYERNORM_C = 54,
BI_LSTM_BW_INPUT_LAYERNORM_O = 55,
BI_LSTM_FW_INPUT_BIAS_R2I = 56,
BI_LSTM_FW_INPUT_BIAS_R2F = 57,
BI_LSTM_FW_INPUT_BIAS_R2C = 58,
BI_LSTM_FW_INPUT_BIAS_R2O = 59,
BI_LSTM_BW_INPUT_BIAS_R2I = 60,
BI_LSTM_BW_INPUT_BIAS_R2F = 61,
BI_LSTM_BW_INPUT_BIAS_R2C = 62,
BI_LSTM_BW_INPUT_BIAS_R2O = 63,
BI_LSTM_INPUT_CNT,
BI_LSTM_FW_OUTPUT_OUTPUT = 0,
BI_LSTM_BW_OUTPUT_OUTPUT = 1,
BI_LSTM_OUTPUT_CNT
};

View File

@ -0,0 +1,47 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_CROP_AND_RESIZE_H
#define _VSI_NN_OP_CROP_AND_RESIZE_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_crop_and_resize_param
{
struct _crop_and_resize_local_data_t * lcl_data;
const int32_t* crop_size;
vsi_enum resize_method;
float extrapolation_value;
} vsi_nn_crop_and_resize_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -70,6 +70,11 @@ enum
LSTM_INPUT_AUX_WEIGHT_I2C = 27,
LSTM_INPUT_AUX_WEIGHT_I2O = 28,
LSTM_INPUT_BIAS_R2I = 29,
LSTM_INPUT_BIAS_R2F = 30,
LSTM_INPUT_BIAS_R2C = 31,
LSTM_INPUT_BIAS_R2O = 32,
LSTM_INPUT_CNT,
LSTM_OUTPUT_OUTPUT = 0,

View File

@ -74,6 +74,11 @@ enum
LSTMUNIT_INPUT_AUX_WEIGHT_I2C = 27,
LSTMUNIT_INPUT_AUX_WEIGHT_I2O = 28,
LSTMUNIT_INPUT_BIAS_R2I = 29,
LSTMUNIT_INPUT_BIAS_R2F = 30,
LSTMUNIT_INPUT_BIAS_R2C = 31,
LSTMUNIT_INPUT_BIAS_R2O = 32,
LSTMUNIT_INPUT_CNT,
LSTMUNIT_OUTPUT_OUTPUT = 0,

View File

@ -38,7 +38,8 @@ typedef uint32_t vsi_nn_interpolation_type_t; enum
{
VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR = 0,
VSI_NN_INTERPOLATION_BILINEAR,
VSI_NN_INTERPOLATION_AREA
VSI_NN_INTERPOLATION_AREA,
VSI_NN_INTERPOLATION_CUBIC
};
typedef uint32_t vsi_nn_resize_layout_type_t; enum

View File

@ -33,6 +33,7 @@ extern "C" {
typedef struct _vsi_nn_scatter_nd_update_param
{
vsi_bool use_locking;
vsi_nn_reduction_type_e reduction;
} vsi_nn_scatter_nd_update_param;
#ifdef __cplusplus

View File

@ -471,6 +471,12 @@ char* vsi_nn_getenv
const char * var_name
);
int32_t vsi_nn_getenv_asint
(
const char* env,
int32_t default_value
);
FILE* vsi_nn_fopen
(
const char * file_name,

View File

@ -43,6 +43,7 @@ class IDevice {
OVXLIB_API IDevice(uint32_t id);
OVXLIB_API ~IDevice();
OVXLIB_API uint32_t Id() const;
OVXLIB_API bool GraphSubmit(vsi_nn_graph_t* graph, bool (*func)(const void*), data_t data);
OVXLIB_API bool GraphSubmit(vsi_nn_graph_t* graph, func_t func, data_t data);
OVXLIB_API bool GraphRemove(const vsi_nn_graph_t* graph);
OVXLIB_API bool ThreadExit();

View File

@ -79,6 +79,8 @@ typedef struct _vsi_nn_runtime_option_t
int32_t enable_dataconvert_optimize;
int32_t enable_stream_processor;
int32_t enable_rgb88_planar_nhwc;
int32_t enable_slice_optimize;
int32_t enable_batch_opt;
} vsi_nn_runtime_option_t;
/**

View File

@ -1,3 +1,26 @@
/****************************************************************************
*
* Copyright (c) 2019 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the Software),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
/*****Auto generated header file, Please DO NOT modify manually!*****/
#ifndef _VSI_NN_FEATURE_CONFIG_H
#define _VSI_NN_FEATURE_CONFIG_H
@ -20,5 +43,15 @@
#define VSI_CONCAT_ENHANCE_SUPPORT
#endif
#define VSI_CREATE_TENSOR_FROM_VIEW_SUPPORT
#ifndef VSI_SWAP_HANDLE_CACHE_SUPPORT
#define VSI_SWAP_HANDLE_CACHE_SUPPORT
#endif
#define VSI_EXPORT_APIS_FOR_SETUP_GRAPH 1
#if defined(VX_SET_TENSOR_MEMPOOL_TYPE_SUPPORT) && VX_SET_TENSOR_MEMPOOL_TYPE_SUPPORT
#define VSI_CREATE_TENSOR_FROM_AXISRAM_SUPPORT
#endif
#if defined(VX_13_NN_COMPATIBLITY)
#define VSI_MAP_TENSOR_PATCH_SUPPORT
#endif
#endif

View File

@ -382,6 +382,31 @@ OVXLIB_API vsi_nn_tensor_id_t vsi_nn_AddTensorFromView
vsi_size_t* end
);
/**
* Add a new tensor from AXI-SRAM
* Create a new tensor from internal AXI-SRAM and add it to the graph.
* This only creates the tensor object; the memory in AXI-SRAM is not actually
* allocated until the graph verification stage. In other words, the tensor object is
* created beforehand, but the memory that stores its data is not allocated until
* vsi_nn_VerifyGraph runs. AXI-SRAM is an internal memory resource whose allocation
* is scheduled during graph verification to optimize performance and resource usage.
* If there is not enough memory in AXI-SRAM, vsi_nn_VerifyGraph will return VSI_FAILURE.
* The user cannot access the tensor memory (read/write tensor data) before the graph has
* been verified, since the tensor memory is not allocated yet.
* @param[in] graph Graph handle
* @param[in] id Optional id to the tensor, set it to VSI_NN_TENSOR_ID_AUTO,
* and a new id will be generated.
* @param[in] attr Tensor attributes for the new tensor.
*
* @return The new tensor id on success, or VSI_NN_TENSOR_ID_NA otherwise.
*/
OVXLIB_API vsi_nn_tensor_id_t vsi_nn_AddTensorFromAXISRAM
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_id_t id,
vsi_nn_tensor_attr_t * attr
);
/**
* Attach tensor to graph
* Attach an exist tensor to graph.
@ -796,6 +821,18 @@ OVXLIB_API vsi_status vsi_nn_SetGraphTransformOption
size_t size
);
/**
* Graph shape inference
*
* @param[in] graph Graph handle
*
* @return VSI_SUCCESS on success, or appropriate error code otherwise
* */
OVXLIB_API vsi_status vsi_nn_InferShape
(
vsi_nn_graph_t* graph
);
#ifdef __cplusplus
}
#endif
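
A minimal usage sketch for the new AXI-SRAM tensor API follows. It assumes a graph that has already been populated with nodes; the helper name, attribute values, and umbrella header are illustrative rather than part of this change, and only vsi_nn_AddTensorFromAXISRAM, vsi_nn_VerifyGraph, and the tensor-id constants come from the declarations above.

#include <string.h>
#include "vsi_nn_pub.h" /* assumed ovxlib umbrella header */

/* Illustrative helper: attribute values are placeholders. */
static vsi_status _add_axisram_tensor_sketch( vsi_nn_graph_t * graph )
{
    vsi_nn_tensor_attr_t attr;
    vsi_nn_tensor_id_t id;

    memset( &attr, 0, sizeof( attr ) );
    attr.dim_num = 2;
    attr.size[0] = 64;
    attr.size[1] = 64;
    attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16;

    /* Only the tensor object is created here; no AXI-SRAM is allocated yet. */
    id = vsi_nn_AddTensorFromAXISRAM( graph, VSI_NN_TENSOR_ID_AUTO, &attr );
    if ( VSI_NN_TENSOR_ID_NA == id )
    {
        return VSI_FAILURE;
    }

    /* AXI-SRAM is allocated during verification, which fails if the on-chip
       memory is too small; tensor data must not be read or written before
       this call succeeds. */
    return vsi_nn_VerifyGraph( graph );
}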

View File

@ -155,6 +155,22 @@ OVXLIB_API void vsi_nn_PrintNode
vsi_nn_node_id_t id
);
#if VX_GRAPH_BATCH_OPT_SUPPORT
/**
* Set how many pieces this node is split into along the batch dimension.
*
* @param[in] node Node.
* @param[in] split_num Number of splits along the batch dimension.
*
* @return VSI_SUCCESS on success, or error code otherwise.
*/
OVXLIB_API vsi_status vsi_nn_SetNodeBatchSplitNum
(
vsi_nn_node_t* node,
int8_t split_num
);
#endif
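
A hedged sketch of how the new batch-split hook might be called; the node pointer and split count are illustrative, and the call is only available when VX_GRAPH_BATCH_OPT_SUPPORT is defined.

#if VX_GRAPH_BATCH_OPT_SUPPORT
    /* Illustrative: ask the runtime to split this node into 4 pieces along
       the batch dimension before the graph is verified. */
    if ( VSI_SUCCESS != vsi_nn_SetNodeBatchSplitNum( node, 4 ) )
    {
        VSILOGE( "Set batch split num failed" );
    }
#endif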
/**
* Update node attribute
* Update openvx node attribute based on ovxlib's node attribute

View File

@ -209,6 +209,7 @@
#include "ops/vsi_nn_op_lpnorm.h"
#include "ops/vsi_nn_op_resize_3d.h"
#include "ops/vsi_nn_op_reducel2.h"
#include "ops/vsi_nn_op_crop_and_resize.h"
/* custom node head define define */
#include "custom/vsi_nn_custom_node_type.h"
#include "ops/vsi_nn_op_inverse_sigmoid.h"
@ -406,6 +407,7 @@ typedef union _vsi_nn_nn_param
vsi_nn_lpnorm_param lpnorm;
vsi_nn_resize_3d_param resize_3d;
vsi_nn_reducel2_param reducel2;
vsi_nn_crop_and_resize_param crop_and_resize;
void* client_param;
/* custom node data struct define */

View File

@ -35,6 +35,9 @@
#if defined(VX_KHR_COMPATIBILITY) && (0x1==VX_KHR_COMPATIBILITY)
#include <VX/vx_khr_compatible.h>
#endif
#ifdef VSI_CREATE_TENSOR_FROM_AXISRAM_SUPPORT
#include <VX/vx_viv_sys.h>
#endif
/*
This is a compatibility head file for backward compatibility OpenVX 1.1 spec

View File

@ -89,6 +89,8 @@ typedef enum
VSI_NN_SOURCE_FORMAT_IMAGE_YUYV422,
VSI_NN_SOURCE_FORMAT_IMAGE_UYVY422,
VSI_NN_SOURCE_FORMAT_IMAGE_NV21,
VSI_NN_SOURCE_FORMAT_IMAGE_NV12_RGGB,
VSI_NN_SOURCE_FORMAT_IMAGE_NV21_BGGR,
} vsi_nn_preprocess_source_format_e;
/**

View File

@ -54,5 +54,10 @@
#include "utils/vsi_nn_dtype_util.h"
#include "quantization/vsi_nn_asymmetric_affine.h"
#include "quantization/vsi_nn_dynamic_fixed_point.h"
#if defined(VSI_ENABLE_LCOV_TEST) && VSI_ENABLE_LCOV_TEST
#include "lcov/vsi_nn_coverage.h"
#endif
#endif

View File

@ -817,6 +817,82 @@ vsi_nn_tensor_t * vsi_nn_dropout_tensor
float rate
);
/**
* Allows the application to get direct access to a patch of tensor object.
* A wrapper API for the OpenVX vxMapTensorPatch call.
*
* @param[in] graph Graph handle.
* @param[in] tensor Tensor handle.
* @param[out] ptr The address of a pointer that the function sets to the
* address where the requested data can be accessed. The returned (*ptr) address
* is only valid between the call to the function and the corresponding call to
* vsi_nn_UnmapTensorPatch.
* @param [in] usage This declares the access mode for the tensor patch, using
* the vsi_nn_accessor_type_e enumeration.
* VSI_NN_READ_ONLY: after the function call, the content of the memory location
* pointed by (*ptr) contains the tensor patch data. Writing into this memory location
* is forbidden and its behavior is undefined.
* VSI_NN_READ_AND_WRITE : after the function call, the content of the memory
* location pointed by (*ptr) contains the tensor patch data; writing into this memory
* is allowed only for the location of items and will result in a modification of the
* affected items in the tensor object once the range is unmapped. Writing into
* a gap between items (when (*stride) > item size in bytes) is forbidden and its
* behavior is undefined.
* VSI_NN_WRITE_ONLY: after the function call, the memory location pointed by (*ptr)
* contains undefined data; writing each item of the range is required prior to
* unmapping. Items not written by the application before unmap will become
* undefined after unmap, even if they were well defined before map. Like for
* VSI_NN_READ_AND_WRITE, writing into a gap between items is forbidden and its behavior
* is undefined.
* @return VSI_SUCCESS on success, or error code otherwise.
*/
OVXLIB_API vsi_status vsi_nn_MapTensorPatch
(
vsi_nn_graph_t* graph,
vsi_nn_tensor_t* tensor,
void** ptr,
vsi_nn_accessor_type_e usage
);
/**
* Unmap and commit potential changes to a tensor object patch that was previously mapped.
* Unmapping a tensor patch invalidates the memory location from which the patch could
* be accessed by the application. Accessing this memory location after the unmap function
* completes results in undefined behavior.
* @param[in] graph Graph handle.
* @param [in] tensor The reference to the tensor object to unmap.
* @return VSI_SUCCESS on success, or error code otherwise.
*/
OVXLIB_API vsi_status vsi_nn_UnmapTensorPatch
(
vsi_nn_graph_t* graph,
vsi_nn_tensor_t* tensor
);
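
A sketch of the intended map/write/unmap sequence, assuming the graph has already been verified and tensor refers to one of its tensors; the float element type and the single-element write are illustrative only.

    void * ptr = NULL;
    vsi_status status = vsi_nn_MapTensorPatch( graph, tensor, &ptr, VSI_NN_READ_AND_WRITE );
    if ( VSI_SUCCESS == status && NULL != ptr )
    {
        float * data = (float *)ptr; /* (*ptr) is only valid until the unmap call */
        data[0] = 1.0f;              /* illustrative write into the mapped patch */
        /* Commit the change and invalidate the mapping. */
        status = vsi_nn_UnmapTensorPatch( graph, tensor );
    }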
/**
* Create a new tensor from internal AXI-SRAM (kernel-driver mapped).
* This only creates the tensor object; the memory in AXI-SRAM is not actually
* allocated until the graph verification stage. In other words, the tensor
* object is created beforehand, but the memory that stores its data is not
* allocated until vsi_nn_VerifyGraph runs. AXI-SRAM is an internal memory
* resource whose allocation is scheduled during graph verification to optimize
* performance and resource usage.
* If there is not enough memory in AXI-SRAM, vsi_nn_VerifyGraph will return VSI_FAILURE.
* The user cannot access the tensor memory (read/write tensor data) before the graph has
* been verified, since the tensor memory is not allocated yet.
* @param[in] graph Graph handle
* @param[in] attr Tensor attributes for the new tensor.
*
* @return Tensor handle on success, or NULL otherwise.
*/
OVXLIB_API vsi_nn_tensor_t * vsi_nn_CreateTensorFromAXISRAM
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_attr_t * attr
);
#ifdef __cplusplus
}
#endif

View File

@ -115,7 +115,9 @@ typedef enum
{
VSI_NN_REDUCTION_TYPE_NONE,
VSI_NN_REDUCTION_TYPE_ADD,
VSI_NN_REDUCTION_TYPE_MUL
VSI_NN_REDUCTION_TYPE_MUL,
VSI_NN_REDUCTION_TYPE_MAX,
VSI_NN_REDUCTION_TYPE_MIN
} vsi_nn_reduction_type_e;
/** Pad mode enum */
@ -269,7 +271,9 @@ typedef enum _vsi_nn_yuv_type
typedef enum _vsi_nn_nv_type
{
VSI_NN_YUV_TYPE_NV12,
VSI_NN_YUV_TYPE_NV21
VSI_NN_YUV_TYPE_NV21,
VSI_NN_YUV_TYPE_NV12_RGGB,
VSI_NN_YUV_TYPE_NV21_BGGR
}vsi_nn_nv_type;
typedef enum _vsi_nn_roi_align_type_e
@ -283,6 +287,12 @@ typedef enum _vsi_nn_custom_warp_affine_type_e {
VSI_NN_WARP_AFFINE_TYPE_RGB
} vsi_nn_custom_warp_affine_type_e;
typedef enum _vsi_nn_accessor_type_e {
VSI_NN_READ_ONLY = VX_READ_ONLY,
VSI_NN_WRITE_ONLY = VX_WRITE_ONLY,
VSI_NN_READ_AND_WRITE = VX_READ_AND_WRITE
} vsi_nn_accessor_type_e;
/** Deprecated */
typedef uint32_t vsi_nn_size_t;

View File

@ -32,8 +32,8 @@ extern "C"{
#endif
#define VSI_NN_VERSION_MAJOR 1
#define VSI_NN_VERSION_MINOR 1
#define VSI_NN_VERSION_PATCH 88
#define VSI_NN_VERSION_MINOR 2
#define VSI_NN_VERSION_PATCH 2
#define VSI_NN_VERSION \
(VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
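With this bump the packed value becomes 1 * 10000 + 2 * 100 + 2 = 10202 (previously 1 * 10000 + 1 * 100 + 88 = 10188), so checks against VSI_NN_VERSION pick up the rel/1.2.2 update.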

View File

@ -14,6 +14,10 @@ ifeq ($(PLATFORM_VENDOR),1)
LOCAL_VENDOR_MODULE := true
endif
$(info Remove $(LOCAL_PATH)/../include/vsi_nn_feature_config.h ...)
$(shell rm $(LOCAL_PATH)/../include/vsi_nn_feature_config.h -rf)
$(info $(shell bash $(LOCAL_PATH)/../gcc_gen_feature_config_header.sh $(LOCAL_PATH)/..))
LOCAL_SRC_FILES := \
vsi_nn_context.c \
vsi_nn_client_op.c \
@ -59,12 +63,6 @@ LOCAL_SRC_FILES += \
post/vsi_nn_post_fasterrcnn.c \
post/vsi_nn_post_cmupose.c
LOCAL_SRC_FILES += \
cpu_backend/vsi_nn_cpu_backend.c \
cpu_backend/vsi_nn_cpu_backend_conv2d.c \
cpu_backend/vsi_nn_cpu_backend_deconv2d.c \
cpu_backend/npuref_interface.c
LOCAL_SRC_FILES += libnnext/vsi_nn_libnnext_resource.c \
libnnext/vsi_nn_vxkernel.c
@ -78,11 +76,10 @@ LOCAL_SRC_FILES += kernel/vsi_nn_kernel.c \
kernel/vsi_nn_kernel_param.c \
kernel/vsi_nn_kernel_gpu_shape_optimize.c \
kernel/vsi_nn_kernel_lut.c \
kernel/vsi_nn_spinst.c \
kernel/vsi_nn_sp_unit_operation.c \
kernel/vsi_nn_sp_lut.c \
kernel/vsi_nn_gpu.c
LOCAL_SRC_FILES += vip/virtual_device.cpp
LIBNNEXT_KERNEL_SOURCES := $(wildcard $(LOCAL_PATH)/libnnext/ops/kernel/*.c)
LOCAL_SRC_FILES += $(LIBNNEXT_KERNEL_SOURCES:$(LOCAL_PATH)/%=%)
@ -117,13 +114,14 @@ LOCAL_C_INCLUDES += \
$(AQROOT)/sdk/inc/ \
$(AQROOT)/sdk/inc/HAL \
$(LOCAL_PATH)/../include \
$(LOCAL_PATH)/../include/vip \
$(LOCAL_PATH)/../include/ops \
$(LOCAL_PATH)/../include/utils \
$(LOCAL_PATH)/../include/infernce \
$(LOCAL_PATH)/../include/client \
$(LOCAL_PATH)/../include/cpu_backend \
$(LOCAL_PATH)/../include/libnnext \
$(LOCAL_PATH)/../src
$(LOCAL_PATH)/../src \
$(LOCAL_PATH)/../src/vip
LOCAL_CFLAGS := \
-DLINUX \

View File

@ -22,6 +22,7 @@
*
*****************************************************************************/
#if !(VX_RELATIONAL_OPS_VX_SUPPORT_EXT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -293,6 +294,16 @@ static vsi_status _query_kernel
input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE && input0_dtype == I16)
{
input0_dtype = I32;
}
if (inputs[1]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE && input1_dtype == I16)
{
input1_dtype = I32;
}
if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE && output_dtype == I8)
{
output_dtype = BOOL8;
@ -452,3 +463,4 @@ final:
REGISTER_BACKEND_CL( relational_ops, _setup )
__END_DECLS
#endif

View File

@ -0,0 +1,359 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
__BEGIN_DECLS
typedef enum _crop_and_resize_type_e
{
nearest_neighbor = VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR,
bilinear = VSI_NN_INTERPOLATION_BILINEAR,
}crop_and_resize_type_e;
#define _CROP_AND_RESIZE_KERNEL_SOURCE_NAME "crop_and_resize_"
// Add kernel hashtable here
#define CROP_AND_RESIZE_HASH_KEY( IN_DTYPE, OUT_DTYPE, RESIZE_METHOD ) \
(( IN_DTYPE << 16 ) | ( OUT_DTYPE << 8) | (RESIZE_METHOD))
#define CROP_AND_RESIZE_KERNEL( IN_DTYPE, OUT_DTYPE, RESIZE_METHOD ) \
{ CROP_AND_RESIZE_HASH_KEY( IN_DTYPE, OUT_DTYPE, RESIZE_METHOD ), \
CVIVANTE_NAMESPACE("cl.crop_and_resize_"#RESIZE_METHOD"_"#IN_DTYPE"to"#OUT_DTYPE), \
_CROP_AND_RESIZE_KERNEL_SOURCE_NAME#RESIZE_METHOD }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _crop_and_resize_kernel_map[] =
{
// Register kernel here
CROP_AND_RESIZE_KERNEL( U32, U32, nearest_neighbor ),
CROP_AND_RESIZE_KERNEL( U32, F32, nearest_neighbor ),
CROP_AND_RESIZE_KERNEL( F32, F32, nearest_neighbor),
CROP_AND_RESIZE_KERNEL( F32, U32, nearest_neighbor ),
CROP_AND_RESIZE_KERNEL( F32, I32, nearest_neighbor),
CROP_AND_RESIZE_KERNEL( I32, I32, nearest_neighbor ),
CROP_AND_RESIZE_KERNEL( I32, F32, nearest_neighbor),
CROP_AND_RESIZE_KERNEL( U32, U32, bilinear),
CROP_AND_RESIZE_KERNEL( U32, F32, bilinear),
CROP_AND_RESIZE_KERNEL( F32, F32, bilinear),
CROP_AND_RESIZE_KERNEL( F32, U32, bilinear),
CROP_AND_RESIZE_KERNEL( F32, I32, bilinear),
CROP_AND_RESIZE_KERNEL( I32, I32, bilinear),
CROP_AND_RESIZE_KERNEL( I32, F32, bilinear),
};
/*
* Kernel params
*/
static vx_param_description_t _crop_and_resize_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _CROP_AND_RESIZE_PARAM_NUM _cnt_of_array( _crop_and_resize_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_crop_and_resize_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
};
vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
int32_t crop_width = 0;
int32_t crop_height = 0;
int32_t image_width = 0;
int32_t image_height = 0;
int32_t batch_out = 0;
float width_scale = 0;
float height_scale = 0;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &batch_out);
CHECK_STATUS_FAIL_GOTO(status, final );
image_width = (int32_t)(attr[0]->shape->data[0]);
image_height = (int32_t)(attr[0]->shape->data[1]);
crop_width = (int32_t)(attr[1]->shape->data[0]);
crop_height = (int32_t)(attr[1]->shape->data[1]);
width_scale = (crop_width > 1) ? (float)(image_width - 1) / (crop_width -1) : 0;
height_scale = (crop_height > 1) ? (float)(image_height - 1) / (crop_height -1) : 0;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = (crop_width + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0];
gpu_param.global_size[1] = (crop_height + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1];
gpu_param.global_size[2] = (batch_out + gpu_param.global_scale[2] - 1)
/ gpu_param.global_scale[2];
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, final);
status = vsi_nn_kernel_gpu_add_param( node, "width_scale", &width_scale );
status |= vsi_nn_kernel_gpu_add_param( node, "height_scale", &height_scale );
status |= vsi_nn_kernel_gpu_add_param( node, "image_width", &image_width );
status |= vsi_nn_kernel_gpu_add_param( node, "image_height", &image_height );
CHECK_STATUS_FAIL_GOTO(status, final);
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;
} /* _crop_and_resize_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t resize_method
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _crop_and_resize_kernel_map;
size_t kernel_map_size = _cnt_of_array( _crop_and_resize_kernel_map );
vx_param_description_t * param_def = _crop_and_resize_kernel_param_def;
vx_kernel_initialize_f initializer = _crop_and_resize_initializer;
uint32_t key;
uint32_t i;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (F16 == in_dtype)
{
in_dtype = F32;
}
else if (U8 == in_dtype)
{
in_dtype = U32;
}
else if (I8 == in_dtype || I16 == in_dtype)
{
in_dtype = I32;
}
if (F16 == out_dtype)
{
out_dtype = F32;
}
else if (U8 == out_dtype)
{
out_dtype = U32;
}
else if (I8 == out_dtype || I16 == out_dtype)
{
out_dtype = I32;
}
key = CROP_AND_RESIZE_HASH_KEY( in_dtype, out_dtype, resize_method );
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = _cnt_of_array( _crop_and_resize_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_CROP_AND_RESIZE_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL;
vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{0}};
uint32_t ori_depth = (uint32_t)inputs[0]->attr.size[2];
uint32_t ori_batchout = (uint32_t)outputs[0]->attr.size[3];
float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float input_scale = vsi_nn_get_tensor_scale(inputs[0]);
float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float output_scale = vsi_nn_get_tensor_scale(outputs[0]);
float inOutScale = input_scale / output_scale;
float inOutTile = output_zp - inOutScale * input_zp;
float extrapolation_value = vsi_nn_kernel_param_get_float32( params, "extrapolation_value" );
int32_t resize_method = vsi_nn_kernel_param_get_int32( params, "resize_method" );
VSI_UNREFERENCED(params);
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
shapes[0][0] = inputs[0]->attr.size[0];
shapes[0][1] = inputs[0]->attr.size[1];
shapes[0][2] = inputs[0]->attr.size[2] * inputs[0]->attr.size[3];
shapes[1][0] = outputs[0]->attr.size[0];
shapes[1][1] = outputs[0]->attr.size[1];
shapes[1][2] = outputs[0]->attr.size[2] * outputs[0]->attr.size[3];
rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[0], 3 );
rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[1], 3 );
if (rs_input == NULL || rs_output == NULL)
{
goto final;
}
status = _query_kernel( kernel, inputs, outputs, resize_method );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
node_params[0] = rs_input;
node_params[1] = (vsi_nn_kernel_node_param_t)(inputs[1]->t);
node_params[2] = (vsi_nn_kernel_node_param_t)(inputs[2]->t);
node_params[3] = rs_output;
node_params[4] = vsi_nn_kernel_scalar_create( graph, I32, &ori_depth );
node_params[5] = vsi_nn_kernel_scalar_create( graph, I32, &ori_batchout );
node_params[6] = vsi_nn_kernel_scalar_create( graph, F32, &inOutScale );
node_params[7] = vsi_nn_kernel_scalar_create( graph, F32, &inOutTile );
node_params[8] = vsi_nn_kernel_scalar_create( graph, F32, &extrapolation_value );
status = vsi_nn_kernel_node_pass_param( node, node_params, _CROP_AND_RESIZE_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
vsi_nn_kernel_scalar_release( &node_params[6] );
vsi_nn_kernel_scalar_release( &node_params[7] );
vsi_nn_kernel_scalar_release( &node_params[8] );
}
}
final:
if (rs_input)
{
vsi_nn_kernel_tensor_release( &rs_input );
}
if (rs_output)
{
vsi_nn_kernel_tensor_release( &rs_output );
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( crop_and_resize, _setup )

View File

@ -22,7 +22,7 @@
*
*****************************************************************************/
#if !(VX_DEPTH2SPACE_CRD_MODE_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -228,4 +228,4 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CL( depth2space_internal, _setup )
#endif

View File

@ -1,300 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
typedef enum
{
INTERNAL_KERNEL_DETECT_POST_BOX,
} _internal_kernel_e;
#define _DETECT_POST_BOX_KERNEL_SOURCE "detect_post_box"
#define STR(a) #a
// Add kernel hashtable here
#define DETECT_POST_BOX_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
((IN0_DTYPE << 18) | ( IN1_DTYPE << 11 ) | ( OUT_DTYPE << 4))
#define PACK_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
{ DETECT_POST_BOX_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE), \
CVIVANTE_NAMESPACE("cl.detect_post_box_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \
_DETECT_POST_BOX_KERNEL_SOURCE}
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _detect_post_box_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP( F32, F32, F32 ),
PACK_KERNEL_MAP( U8, U8, F32 ),
};
/*
* Kernel params
*/
static vx_param_description_t _detect_post_box_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _DETECT_POST_BOX_PARAM_NUM _cnt_of_array( _detect_post_box_kernel_param_def )
#define _DETECT_POST_BOX_F32_PARAM_NUM 8
#define SCALAR_SCALE_Y (3)
#define SCALAR_SCALE_X (4)
#define SCALAR_SCALE_H (5)
#define SCALAR_SCALE_W (6)
#define SCALAR_LOG_E (7)
#define SCALAR_TAIL0 (8)
#define SCALAR_TAIL1 (9)
#define SCALAR_SCALE0 (10)
#define SCALAR_SCALE1 (11)
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_detect_post_box_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
vsi_size_array_t * in_shape = NULL;
VSI_UNREFERENCED(param_size);
VSI_UNREFERENCED(node);
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
in_shape = input_attr->shape;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.dim = 2;
gpu_param.global_size[0] = (
(in_shape->data[1] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0]);
gpu_param.global_size[1] = (
(in_shape->data[2] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1]);
gpu_param.global_size[2] = 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
SAFE_FREE_TENSOR_ATTR(input_attr);
return status;
} /* _detect_post_box_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
vsi_bool *is_use_u8_kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in0_dtype;
vsi_nn_kernel_dtype_e in1_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _detect_post_box_kernel_map;
size_t kernel_map_size = _cnt_of_array( _detect_post_box_kernel_map );
vx_param_description_t * param_def = _detect_post_box_kernel_param_def;
size_t param_def_size = _cnt_of_array( _detect_post_box_kernel_param_def );
vx_kernel_initialize_f initializer = _detect_post_box_initializer;
uint32_t key;
uint32_t i;
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if ((U8 == in0_dtype) && (U8 == in1_dtype))
{
*is_use_u8_kernel = TRUE;
param_def_size = _DETECT_POST_BOX_PARAM_NUM;
}
else
{
*is_use_u8_kernel = FALSE;
param_def_size = _DETECT_POST_BOX_F32_PARAM_NUM;
}
key = DETECT_POST_BOX_HASH_KEY( in0_dtype, in1_dtype, out_dtype );
for ( i = 0; i < kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = (vx_uint32)param_def_size;
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_DETECT_POST_BOX_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
float logE = (float)(log10(exp(1.0f)) / log10(2.0f));
float inv_scale_y = vsi_nn_kernel_param_get_float32( params, "inv_scale_y" );
float inv_scale_x = vsi_nn_kernel_param_get_float32( params, "inv_scale_x" );
float inv_scale_h = vsi_nn_kernel_param_get_float32( params, "inv_scale_h" );
float inv_scale_w = vsi_nn_kernel_param_get_float32( params, "inv_scale_w" );
vsi_bool is_use_u8_kernel = FALSE;
float input0Scale = vsi_nn_get_tensor_scale(inputs[0]);
float input0Zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float input0Tail = -input0Zp * input0Scale;
float input1Scale = vsi_nn_get_tensor_scale(inputs[1]);
float input1Zp = (float)vsi_nn_get_tensor_zero_point(inputs[1]);
float input1Tail = -input1Zp * input1Scale;
status = _query_kernel( kernel, inputs, outputs, &is_use_u8_kernel );
if ( VSI_SUCCESS == status )
{
size_t node_params_num = _DETECT_POST_BOX_F32_PARAM_NUM;
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _DETECT_POST_BOX_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[SCALAR_SCALE_Y] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_y );
node_params[SCALAR_SCALE_X] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_x );
node_params[SCALAR_SCALE_H] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_h );
node_params[SCALAR_SCALE_W] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_w );
node_params[SCALAR_LOG_E] = vsi_nn_kernel_scalar_create( graph, F32, &logE );
if (is_use_u8_kernel)
{
node_params[SCALAR_TAIL0] = vsi_nn_kernel_scalar_create( graph, F32, &input0Tail );
node_params[SCALAR_TAIL1] = vsi_nn_kernel_scalar_create( graph, F32, &input1Tail );
node_params[SCALAR_SCALE0] = vsi_nn_kernel_scalar_create( graph, F32, &input0Scale );
node_params[SCALAR_SCALE1] = vsi_nn_kernel_scalar_create( graph, F32, &input1Scale );
node_params_num = _DETECT_POST_BOX_PARAM_NUM;
}
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
VSI_ASSERT( status == VSI_SUCCESS );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_Y] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_X] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_H] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_W] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_LOG_E] );
if (is_use_u8_kernel)
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_TAIL0] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_TAIL1] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE0] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE1] );
}
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( detect_post_box, _setup )

View File

@ -1,197 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
#if 0
/*
* Define kernel meta.
*/
typedef enum
{
INTERNAL_KERNEL_DETECT_POST_NMS,
} _internal_kernel_e;
#define _DETECT_POST_NMS_KERNEL_SOURCE "detect_post_nms"
#define _DETECT_POST_NMS_KERNEL_NAME CVIVANTE_NAMESPACE("cl.detect_post_nms")
// Add kernel hashtable here
#define DETECT_POST_NMS_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
(( IN_DTYPE << 8 ) | ( OUT_DTYPE ))
#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, SOURCE ) \
{ DETECT_POST_NMS_HASH_KEY( IN_DTYPE, OUT_DTYPE ), _DETECT_POST_NMS_KERNEL_NAME, SOURCE }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _detect_post_nms_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP( F32, F32, _DETECT_POST_NMS_KERNEL_SOURCE ),
};
/*
* Kernel params
*/
static vx_param_description_t _detect_post_nms_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _DETECT_POST_NMS_PARAM_NUM _cnt_of_array( _detect_post_nms_kernel_param_def )
#define SCALAR_NMS_TYPE (6)
#define SCALAR_MAX_NUM (7)
#define SCALAR_MAX_CLASS (8)
#define SCALAR_MAX_DETECT (9)
#define SCALAR_SCORE_TH (10)
#define SCALAR_IOU_TH (11)
#define SCALAR_IS_BG (12)
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_detect_post_nms_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
return status;
} /* _detect_post_nms_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _detect_post_nms_kernel_map;
size_t kernel_map_size = _cnt_of_array( _detect_post_nms_kernel_map );
vx_param_description_t * param_def = _detect_post_nms_kernel_param_def;
size_t param_def_size = _cnt_of_array( _detect_post_nms_kernel_param_def );
vx_kernel_initialize_f initializer = _detect_post_nms_initializer;
uint32_t key;
uint32_t i;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = DETECT_POST_NMS_HASH_KEY( in_dtype, out_dtype );
for ( i = 0; i < kernel_map_size; i++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = param_def_size;
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
#endif
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_nn_kernel_node_t node = NULL;
VSI_UNREFERENCED(graph);
VSI_UNREFERENCED(inputs);
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(outputs);
VSI_UNREFERENCED(output_num);
VSI_UNREFERENCED(params);
VSI_UNREFERENCED(kernel);
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( detect_post_nms, _setup )

View File

@ -60,6 +60,7 @@ typedef enum
UNARY_ATANH,
UNARY_ACOSH,
UNARY_INVERSE_SIGMOID,
UNARY_TAN,
} unary_type_e;
/*
@ -108,6 +109,7 @@ typedef enum
#define ATANH_OPERATION atanh
#define ACOSH_OPERATION acosh
#define INVERSE_SIGMOID_OPERATION inverse_sigmoid
#define TAN_OPERATION tan
#define ADD_UNARY_SH_KERNELS(name) \
TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, F32, F32) \
@ -142,6 +144,7 @@ static const struct {
ADD_UNARY_SH_KERNELS(ATANH)
ADD_UNARY_SH_KERNELS(ACOSH)
ADD_UNARY_SH_KERNELS(INVERSE_SIGMOID)
ADD_UNARY_SH_KERNELS(TAN)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I32, I32)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I32, I32)
@ -166,6 +169,7 @@ static const struct {
#undef ATANH_OPERATION
#undef ACOSH_OPERATION
#undef INVERSE_SIGMOID_OPERATION
#undef TAN_OPERATION
/*
* Kernel params
*/
@ -452,16 +456,22 @@ OnError:
REGISTER_BACKEND_CL( KERNEL_NAME, _##KERNEL_NAME##_setup )
#if !(VX_ACTIVATION_SIN_COS_VX_SUPPORT_EXT)
REGISTER_ELTWISE_UNARY_BACKEND_CL( sin, UNARY_SIN )
REGISTER_ELTWISE_UNARY_BACKEND_CL( cos, UNARY_COS )
#endif
#if !(VX_ACTIVATION_EXP_VX_SUPPORT_EXT)
REGISTER_ELTWISE_UNARY_BACKEND_CL( exp, UNARY_EXP )
#endif
REGISTER_ELTWISE_UNARY_BACKEND_CL( log, UNARY_LOG )
REGISTER_ELTWISE_UNARY_BACKEND_CL( neg, UNARY_NEG )
REGISTER_ELTWISE_UNARY_BACKEND_CL( hard_sigmoid, UNARY_HSIGMOID )
REGISTER_ELTWISE_UNARY_BACKEND_CL( mish, UNARY_MISH )
REGISTER_ELTWISE_UNARY_BACKEND_CL( round, UNARY_ROUND )
#if !(VX_ACTIVATION_GELU_VX_SUPPORT_EXT)
REGISTER_ELTWISE_UNARY_BACKEND_CL( gelu, UNARY_GELU )
REGISTER_ELTWISE_UNARY_BACKEND_CL( hard_gelu, UNARY_HGELU )
#endif
REGISTER_ELTWISE_UNARY_BACKEND_CL( selu, UNARY_SELU )
REGISTER_ELTWISE_UNARY_BACKEND_CL( celu, UNARY_CELU )
REGISTER_ELTWISE_UNARY_BACKEND_CL( rcp, UNARY_RCP )
@ -471,5 +481,6 @@ REGISTER_ELTWISE_UNARY_BACKEND_CL( atan, UNARY_ATAN )
REGISTER_ELTWISE_UNARY_BACKEND_CL( atanh, UNARY_ATANH )
REGISTER_ELTWISE_UNARY_BACKEND_CL( acosh, UNARY_ACOSH )
REGISTER_ELTWISE_UNARY_BACKEND_CL( inverse_sigmoid, UNARY_INVERSE_SIGMOID )
REGISTER_ELTWISE_UNARY_BACKEND_CL( tan, UNARY_TAN )
__END_DECLS

View File

@ -22,7 +22,7 @@
*
*****************************************************************************/
#if !(VX_TENSOR_GATHER_API_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -420,3 +420,4 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CL( gather, _setup )
#endif

View File

@ -90,6 +90,8 @@ static vx_param_description_t _grucell_activation_z_h_kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _GRUCELL_ACTIVATION_Z_H_PARAM_NUM _cnt_of_array( _grucell_activation_z_h_kernel_param_def )
@ -97,6 +99,8 @@ static vx_param_description_t _grucell_activation_z_h_kernel_param_def[] =
#define SCALAR_INPUT_TAIL (8)
#define SCALAR_OUTPUT_SCALE (9)
#define SCALAR_OUTPUT_ZP (10)
#define SCALAR_OUTPUT1_SCALE (11)
#define SCALAR_OUTPUT1_ZP (12)
/*
* Kernel initializer
*/
@ -244,6 +248,8 @@ static vsi_nn_kernel_node_t _setup
float input_tail = -(float)vsi_nn_get_tensor_zero_point(inputs[GRUCELL_ACT_Z_H_HSTATE]) * input_scale;
float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[GRUCELL_ACT_Z_H_OUT_OUTPUT]);
float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[GRUCELL_ACT_Z_H_OUT_OUTPUT]);
float output_scale1 = 1.0f / vsi_nn_get_tensor_scale(outputs[GRUCELL_ACT_Z_H_OUT_HSTATE]);
float output_zp1 = (float)vsi_nn_get_tensor_zero_point(outputs[GRUCELL_ACT_Z_H_OUT_HSTATE]);
if( activation != VSI_NN_ACT_TANH )
{
@ -268,11 +274,17 @@ static vsi_nn_kernel_node_t _setup
graph, F32, &output_scale );
node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create(
graph, F32, &output_zp );
node_params[SCALAR_OUTPUT1_SCALE] = vsi_nn_kernel_scalar_create(
graph, F32, &output_scale1 );
node_params[SCALAR_OUTPUT1_ZP] = vsi_nn_kernel_scalar_create(
graph, F32, &output_zp1 );
status = vsi_nn_kernel_node_pass_param( node, node_params, _GRUCELL_ACTIVATION_Z_H_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT1_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT1_ZP] );
}
}
return node;

View File

@ -46,6 +46,7 @@ typedef enum _grucell_nn_activation_type_e
{
SIGMOID = VSI_NN_ACT_SIGMOID,
HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID,
RELU = VSI_NN_ACT_RELU,
}grucell_nn_activation_type_e;
#define _GRUCELL_RESET_AFTER_ACTIVATION_KERNEL_SOURCE "grucell_reset_after_activation"
@ -71,6 +72,9 @@ static const _kernel_map_type _grucell_reset_after_activation_kernel_map[] =
PACK_KERNEL_MAP( U8, F32, U8, SIGMOID ),
PACK_KERNEL_MAP( I32, F32, I32, SIGMOID ),
PACK_KERNEL_MAP( F32, F32, F32, SIGMOID ),
PACK_KERNEL_MAP( U8, F32, U8, RELU ),
PACK_KERNEL_MAP( I32, F32, I32, RELU ),
PACK_KERNEL_MAP( F32, F32, F32, RELU ),
};

View File

@ -22,7 +22,7 @@
*
*****************************************************************************/
#if !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -360,3 +360,4 @@ final:
__END_DECLS
REGISTER_BACKEND_CL( layer_norm, _setup )
#endif

View File

@ -22,6 +22,7 @@
*
*****************************************************************************/
#if !(VX_LOGSOFTMAX_VX_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -34,6 +35,7 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
__BEGIN_DECLS
@ -41,27 +43,30 @@ __BEGIN_DECLS
/*
* Define kernel meta.
*/
#define HASH_LOG_SOFTMAX_KEY(_axis, _input_type, _output_type, _image_2d) \
((_axis << 20) | (_input_type << 12) | (_output_type << 4) | (_image_2d))
#define HASH_LOG_SOFTMAX_KEY(_axis, _input_type, _output_type, _image_2d, exceed_limit) \
((_axis << 24) | (_input_type << 16) | (_output_type << 8) | (_image_2d << 4) | exceed_limit)
#define VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(_axis) \
"log_softmax_axis"#_axis
#define VSI_NN_GEN_LOG_SOFTMAX_EXCEED_KERNEL_SOURCE_NAME(_axis) \
"log_softmax_exceed_axis"#_axis
#define HASH_LOG_SOFTMAX_SH_KERNEL_NAME(AXIS, SRC0_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("cl.log_softmax_axis"#AXIS"_"#SRC0_TYPE"to"#DST_TYPE)
#define TENSOR_LOG_SOFTMAX_KERNELS(AXIS, SRC0_TYPE, OUT_TYPE) \
{ HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0), \
{ HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0, 0), \
HASH_LOG_SOFTMAX_SH_KERNEL_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \
VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) },
#define TENSOR_LOG_SOFTMAX_FLOAT(AXIS, SRC0_TYPE, OUT_TYPE) \
{ HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0), \
{ HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0, 0), \
HASH_LOG_SOFTMAX_SH_KERNEL_NAME(AXIS, F32, F32), \
VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) },
#define TENSOR_LOG_SOFTMAX_BFLOAT(AXIS, SRC0_TYPE, OUT_TYPE) \
{ HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0), \
{ HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0, 0), \
HASH_LOG_SOFTMAX_SH_KERNEL_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \
VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) },
@ -69,20 +74,28 @@ __BEGIN_DECLS
CVIVANTE_NAMESPACE("cl.log_softmax_axis"#AXIS"_"#SRC0_TYPE"to"#DST_TYPE"_2D")
#define TENSOR_LOG_SOFTMAX_KERNELS_2D(AXIS, SRC0_TYPE, OUT_TYPE) \
{ HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1), \
{ HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1, 0), \
HASH_LOG_SOFTMAX_SH_KERNEL_2D_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \
VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) },
#define TENSOR_LOG_SOFTMAX_FLOAT_2D(AXIS, SRC0_TYPE, OUT_TYPE) \
{ HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1), \
{ HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1, 0), \
HASH_LOG_SOFTMAX_SH_KERNEL_2D_NAME(AXIS, F32, F32), \
VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) },
#define TENSOR_LOG_SOFTMAX_BFLOAT_2D(AXIS, SRC0_TYPE, OUT_TYPE) \
{ HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1), \
{ HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1, 0), \
HASH_LOG_SOFTMAX_SH_KERNEL_2D_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \
VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) },
#define HASH_LOG_SOFTMAX_EXCEED_SH_KERNEL_NAME(AXIS, SRC0_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("cl.log_softmax_exceed_axis"#AXIS"_"#SRC0_TYPE"to"#DST_TYPE)
#define TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(AXIS, SRC0_TYPE, OUT_TYPE) \
{ HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0, 1), \
HASH_LOG_SOFTMAX_EXCEED_SH_KERNEL_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \
VSI_NN_GEN_LOG_SOFTMAX_EXCEED_KERNEL_SOURCE_NAME(AXIS) },
static const struct {
uint32_t key;
char* function_name;
@ -92,31 +105,31 @@ static const struct {
TENSOR_LOG_SOFTMAX_FLOAT(0, F32, F32)
TENSOR_LOG_SOFTMAX_FLOAT(1, F32, F32)
TENSOR_LOG_SOFTMAX_FLOAT(2, F32, F32)
TENSOR_LOG_SOFTMAX_FLOAT(0, F16, F16)
TENSOR_LOG_SOFTMAX_FLOAT(1, F16, F16)
TENSOR_LOG_SOFTMAX_FLOAT(2, F16, F16)
TENSOR_LOG_SOFTMAX_BFLOAT(0, BF16, BF16)
TENSOR_LOG_SOFTMAX_BFLOAT(1, BF16, BF16)
TENSOR_LOG_SOFTMAX_BFLOAT(2, BF16, BF16)
TENSOR_LOG_SOFTMAX_FLOAT_2D(0, F32, F32)
TENSOR_LOG_SOFTMAX_FLOAT_2D(1, F32, F32)
TENSOR_LOG_SOFTMAX_FLOAT_2D(0, F16, F16)
TENSOR_LOG_SOFTMAX_FLOAT_2D(1, F16, F16)
TENSOR_LOG_SOFTMAX_BFLOAT_2D(0, BF16, BF16)
TENSOR_LOG_SOFTMAX_BFLOAT_2D(1, BF16, BF16)
TENSOR_LOG_SOFTMAX_KERNELS(0, U8, U8)
TENSOR_LOG_SOFTMAX_KERNELS(1, U8, U8)
TENSOR_LOG_SOFTMAX_KERNELS(2, U8, U8)
TENSOR_LOG_SOFTMAX_KERNELS(0, U8, U8)
TENSOR_LOG_SOFTMAX_KERNELS(1, U8, U8)
TENSOR_LOG_SOFTMAX_KERNELS(2, U8, U8)
TENSOR_LOG_SOFTMAX_KERNELS_2D(0, U8, U8)
TENSOR_LOG_SOFTMAX_KERNELS_2D(1, U8, U8)
TENSOR_LOG_SOFTMAX_KERNELS_2D(0, U8, U8)
TENSOR_LOG_SOFTMAX_KERNELS_2D(1, U8, U8)
TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(0, U8, U8)
TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(1, U8, U8)
TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(0, F32, F32)
TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(1, F32, F32)
TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(0, BF16, BF16)
TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(1, BF16, BF16)
};
/*
@ -198,12 +211,89 @@ final:
return status;
} /* _log_softmax_initializer() */
DEF_KERNEL_INITIALIZER(_log_softmax_exceed_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
gpu_param_t gpu_param = {
2, // workdim
{0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in thread
{0, 0, 0} // globalWorkSize: image size in thread
};
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
vsi_size_array_t * out_shape = NULL;
int32_t axis = 0;
int32_t width = 0;
int32_t height = 0;
int32_t depth = 0;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis);
CHECK_STATUS_FAIL_GOTO(status, final );
out_shape = attr[1]->shape;
width = (int32_t)(out_shape->data[0]);
height = (int32_t)(out_shape->data[1]);
depth = attr[1]->shape->size > 2 ? (int32_t)(out_shape->data[2]) : 1;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
if (axis == 0)
{
gpu_param.global_size[0] = 1;
gpu_param.global_size[1] = depth;
}
else
{
gpu_param.global_size[0] = width;
gpu_param.global_size[1] = 1;
}
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
if (axis == 0)
{
status |= vsi_nn_kernel_gpu_add_param( node, "width", &width );
}
else
{
status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth );
}
status |= vsi_nn_kernel_gpu_add_param( node, "height", &height );
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
}
return status;
}
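/* For reference (a sketch of the math, not the shader code itself): the
 * numerically stable form these kernels target is
 *     log_softmax(x[i]) = beta * (x[i] - max(x)) - log( sum_j exp( beta * (x[j] - max(x)) ) )
 * where beta already has the input quantization scale folded in by _setup().
 * Judging from the global sizes chosen above, the "exceed" variant is used
 * when the softmax axis is too long to map onto an image dimension, so the
 * dispatch runs over the remaining axis (depth for axis 0, width for axis 1)
 * and the shader iterates over the long axis internally.
 */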
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
int32_t axis,
vsi_bool image_2d,
vsi_bool exceed_limit,
vsi_nn_kernel_t* kernel
)
{
@ -215,7 +305,17 @@ static vsi_status _query_kernel
input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = HASH_LOG_SOFTMAX_KEY( axis, input_dtype, output_dtype, image_2d );
if (input_dtype == F16)
{
input_dtype = F32;
}
if (output_dtype == F16)
{
output_dtype = F32;
}
if (exceed_limit) image_2d = vx_false_e;
key = HASH_LOG_SOFTMAX_KEY( axis, input_dtype, output_dtype, image_2d, exceed_limit );
for( i = 0; i < _cnt_of_array(kernel_map); i ++ )
{
@ -229,7 +329,14 @@ static vsi_status _query_kernel
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = kernel_param_def;
kernel->info.numParams = _cnt_of_array( kernel_param_def );
kernel->info.initialize = _log_softmax_initializer;
if (exceed_limit)
{
kernel->info.initialize = _log_softmax_exceed_initializer;
}
else
{
kernel->info.initialize = _log_softmax_initializer;
}
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
kernel_map[i].source_name );
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
@ -254,7 +361,14 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL};
vsi_bool image_2d = FALSE;
vsi_nn_kernel_node_t node = NULL;
vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } };
uint32_t rank_in = 0;
int32_t axis = 0;
int32_t new_axis = 0;
vsi_bool ret = vx_false_e;
vsi_bool exceed_limit = vx_false_e;
uint32_t i = 0;
float beta = 0;
float inputScale = vsi_nn_get_tensor_scale(inputs[0]);
float outputScale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]);
@ -270,16 +384,37 @@ static vsi_nn_kernel_node_t _setup
scaleValue = scaleValue * beta * inputScale;
beta = beta * inputScale;
if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size,
inputs[0]->attr.dim_num )
|| axis > 2)
if (inputs[0]->attr.size[axis] >= GPU_TENSOR_MAX_WIDTH)
{
exceed_limit = vx_true_e;
}
ret = vsi_nn_kernel_optimize_softmax_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
shapes[0], &rank_in, &new_axis);
if (ret)
{
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shapes[0], rank_in );
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[0], rank_in );
}
else
{
return NULL;
}
image_2d = ((inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1)
&& axis != 2);
status = _query_kernel( inputs, outputs, axis, image_2d, kernel );
if( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[0]->attr.size,
reshape_tensors[0]->attr.dim_num )
|| new_axis > 2 || (new_axis == 2 && exceed_limit))
{
return NULL;
}
image_2d = ((reshape_tensors[0]->attr.dim_num == 2 || reshape_tensors[0]->attr.size[2] == 1)
&& new_axis != 2);
status = _query_kernel( inputs, outputs, new_axis, image_2d, exceed_limit, kernel );
if( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
@ -287,10 +422,10 @@ static vsi_nn_kernel_node_t _setup
if( node )
{
vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM,
inputs, 1, outputs, 1 );
reshape_tensors, 1, &reshape_tensors[1], 1 );
node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create(
graph, I32, &axis );
graph, I32, &new_axis );
node_params[SCALAR_INPUT_BETA] = vsi_nn_kernel_scalar_create(
graph, F32, &beta );
node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create(
@ -311,9 +446,16 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] );
}
}
for (i = 0; i < 2; i++)
{
vsi_safe_release_tensor(reshape_tensors[i]);
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( log_softmax, _setup )
#endif

View File

@ -75,6 +75,9 @@ __BEGIN_DECLS
#define HASH_MATRIXMUL_4X_TRANSA_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \
CVIVANTE_NAMESPACE("cl.gemm_4x_transa_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM)
#define HASH_MATRIXMUL_4X_TRANSA_LOCAL_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \
CVIVANTE_NAMESPACE("cl.gemm_4x_transa_local_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM)
#define TENSOR_MATRIXMUL_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \
{ HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 0, 0), \
HASH_MATRIXMUL_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \
@ -90,6 +93,11 @@ __BEGIN_DECLS
HASH_MATRIXMUL_4X_TRANSA_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \
SOURCE },
#define TENSOR_MATRIXMUL_4X_TRANSA_LOCAL_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \
{HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 2, 1, 0), \
HASH_MATRIXMUL_4X_TRANSA_LOCAL_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \
SOURCE },
#define TENSOR_MATRIXMUL_TRANSA_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \
{HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 1, 0), \
HASH_MATRIXMUL_TRANSA_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \
@ -142,6 +150,7 @@ static const struct {
TENSOR_MATRIXMUL_MERGE_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_3)
TENSOR_MATRIXMUL_4X_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_4)
TENSOR_MATRIXMUL_4X_TRANSA_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_4)
TENSOR_MATRIXMUL_4X_TRANSA_LOCAL_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_4)
};
/*
@ -313,6 +322,49 @@ final:
return status;
} /* _matrixmul_4x_initializer() */
DEF_KERNEL_INITIALIZER(_matrixmul_4x_local_initializer)
(vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t* param,
size_t param_size) {
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {3, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}};
vsi_nn_kernel_tensor_attr_t* attr = NULL;
vsi_size_t width = 0;
VSI_UNREFERENCED(param_size);
attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[2]);
CHECK_PTR_FAIL_GOTO(attr, "Create tensor attr buffer fail.", final);
width = attr->shape->data[0];
gpu_param.dim = 2;
gpu_param.local_size[0] = 1;
gpu_param.local_size[1] = 64;
gpu_param.local_size[2] = 1;
gpu_param.global_scale[0] = 16;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] =
(width + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0];
gpu_param.global_size[1] = 64;
gpu_param.global_size[2] = 1;
status = vsi_nn_kernel_gpu_config(node, &gpu_param);
CHECK_STATUS_FAIL_GOTO(status, final);
final:
if (attr) {
vsi_nn_kernel_tensor_attr_release(&attr);
attr = NULL;
}
return status;
} /* _matrixmul_4x_local_initializer() */
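/* Dispatch sketch for the 4x-local path, read from the values above (an
 * interpretation, not a spec): global_scale[0] = 16 means each work-item
 * covers a 16-wide slice of the tensor bound at param[2] (presumably the
 * output), and global_size[0] is the ceiling division
 *     (width + 16 - 1) / 16, e.g. width = 1000 -> 63 work-items along x.
 * The y dimension is pinned to one 64-thread work-group
 * (local_size[1] == global_size[1] == 64), which matches the
 * subGroupSize >= 64 / use_40bits_va gate applied in _setup() before
 * flag_4x is promoted to 2.
 */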
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
@ -403,7 +455,10 @@ static vsi_status _query_kernel
kernel->info.numParams = _cnt_of_array( _matrixmul_merge_kernel_param_def );
}
if (flag_4x) {
if ((flag_4x == 2) && (transa == 1)) {
kernel->info.initialize = _matrixmul_4x_local_initializer;
}
else if (flag_4x == 1) {
kernel->info.initialize = _matrixmul_4x_initializer;
} else {
kernel->info.initialize = _matrixmul_initializer;
@ -471,6 +526,7 @@ static vsi_nn_kernel_node_t _setup
uint32_t stride_axis_in_out[9] = {0};
vsi_nn_tensor_t* tmp_inputs[2] = {NULL};
vsi_nn_tensor_t* tmp_outputs[1] = {NULL};
vsi_bool shader_cnt_support = FALSE;
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
@ -585,7 +641,20 @@ static vsi_nn_kernel_node_t _setup
rs_out_tensors = vsi_nn_reshape_tensor(graph, tmp_outputs[0], final_shape, final_rank);
final_out_tensors[0] = rs_out_tensors;
flag_4x = 1;
#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
shader_cnt_support =
(graph->ctx->config.subGroupSize >= 64 && graph->ctx->config.use_40bits_va) ? TRUE : FALSE;
#endif
if ((in1_h % 64 == 0) && (transFlg == 1) && (out_h % 8 == 0) && shader_cnt_support)
{
flag_4x = 2;
}
else
{
flag_4x = 1;
}
}
}

View File

@ -246,28 +246,49 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL};
vsi_bool image_2d = FALSE;
vsi_nn_kernel_node_t node = NULL;
float input0Scale = vsi_nn_get_tensor_scale(inputs[0]);
float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input0Scale;
float input1Scale = vsi_nn_get_tensor_scale(inputs[1]);
float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1Scale;
float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float input0_scale = vsi_nn_get_tensor_scale(inputs[0]);
float input0_tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input0_scale;
float input1_scale = vsi_nn_get_tensor_scale(inputs[1]);
float input1_tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1_scale;
float output_scale = vsi_nn_get_tensor_scale(outputs[0]);
float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } };
vsi_size_t new_rank = 0;
vsi_bool ret = TRUE;
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
VSI_UNREFERENCED(params);
outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale;
output_scale = vsi_abs(output_scale) < 1e-5 ? 0.0f : 1.0f / output_scale;
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
ret = vsi_nn_kernel_optimize_eltwise_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num,
inputs[1]->attr.size, inputs[1]->attr.dim_num,
outputs[0]->attr.size, outputs[0]->attr.dim_num,
shapes[0], shapes[1], shapes[2], &new_rank );
if (ret == FALSE)
{
return NULL;
goto final;
}
image_2d = (outputs[0]->attr.dim_num == 2);
status = _query_kernel( inputs, outputs, image_2d, kernel );
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shapes[0], new_rank );
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
inputs[1], shapes[1], new_rank );
reshape_tensors[2] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[2], new_rank );
if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[2]->attr.size,
reshape_tensors[2]->attr.dim_num ) )
{
goto final;
}
image_2d = (reshape_tensors[2]->attr.dim_num == 2);
status = _query_kernel( reshape_tensors, &reshape_tensors[2], image_2d, kernel );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
@ -275,19 +296,19 @@ static vsi_nn_kernel_node_t _setup
if ( node )
{
vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM,
inputs, 2, outputs, 1 );
reshape_tensors, 2, &reshape_tensors[2], 1 );
node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create(
graph, F32, &input0Scale );
graph, F32, &input0_scale );
node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create(
graph, F32, &input0Tail );
graph, F32, &input0_tail );
node_params[SCALAR_INPUT1_SCALE] = vsi_nn_kernel_scalar_create(
graph, F32, &input1Scale );
graph, F32, &input1_scale );
node_params[SCALAR_INPUT1_TAIL] = vsi_nn_kernel_scalar_create(
graph, F32, &input1Tail );
graph, F32, &input1_tail );
node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create(
graph, F32, &outputScale );
graph, F32, &output_scale );
node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create(
graph, F32, &outputZP );
graph, F32, &output_zp );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM );
@ -300,6 +321,12 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] );
}
}
final:
vsi_safe_release_tensor(reshape_tensors[0]);
vsi_safe_release_tensor(reshape_tensors[1]);
vsi_safe_release_tensor(reshape_tensors[2]);
return node;
} /* _setup() */
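/* Note on the reshape step above (an assumption about the helper, not a
 * guarantee): vsi_nn_kernel_optimize_eltwise_shape() is expected to collapse
 * adjacent dimensions that broadcast the same way on both inputs, e.g. two
 * {4, 3, 2, 1} operands reducing to a {24, 1} pair, so the 2D kernel can be
 * selected more often. The node then binds reshape_tensors[0..2] instead of
 * the original tensors, and the final: block releases them whether or not
 * node creation succeeded.
 */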

View File

@ -246,29 +246,49 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL};
vsi_bool image_2d = FALSE;
vsi_nn_kernel_node_t node = NULL;
float input0Scale = vsi_nn_get_tensor_scale(inputs[0]);
float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input0Scale;
float input1Scale = vsi_nn_get_tensor_scale(inputs[1]);
float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1Scale;
float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float input0_scale = vsi_nn_get_tensor_scale(inputs[0]);
float input0_tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input0_scale;
float input1_scale = vsi_nn_get_tensor_scale(inputs[1]);
float input1_tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1_scale;
float output_scale = vsi_nn_get_tensor_scale(outputs[0]);
float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } };
vsi_size_t new_rank = 0;
vsi_bool ret = TRUE;
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
VSI_UNREFERENCED(params);
output_scale = vsi_abs(output_scale) < 1e-5 ? 0.0f : 1.0f / output_scale;
outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale;
ret = vsi_nn_kernel_optimize_eltwise_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num,
inputs[1]->attr.size, inputs[1]->attr.dim_num,
outputs[0]->attr.size, outputs[0]->attr.dim_num,
shapes[0], shapes[1], shapes[2], &new_rank );
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
if (ret == FALSE)
{
return NULL;
goto final;
}
image_2d = (outputs[0]->attr.dim_num == 2);
status = _query_kernel( inputs, outputs, image_2d, kernel );
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shapes[0], new_rank );
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
inputs[1], shapes[1], new_rank );
reshape_tensors[2] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[2], new_rank );
if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[2]->attr.size,
reshape_tensors[2]->attr.dim_num ) )
{
goto final;
}
image_2d = (reshape_tensors[2]->attr.dim_num == 2);
status = _query_kernel( reshape_tensors, &reshape_tensors[2], image_2d, kernel );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
@ -276,19 +296,19 @@ static vsi_nn_kernel_node_t _setup
if ( node )
{
vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM,
inputs, 2, outputs, 1 );
reshape_tensors, 2, &reshape_tensors[2], 1 );
node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create(
graph, F32, &input0Scale );
graph, F32, &input0_scale );
node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create(
graph, F32, &input0Tail );
graph, F32, &input0_tail );
node_params[SCALAR_INPUT1_SCALE] = vsi_nn_kernel_scalar_create(
graph, F32, &input1Scale );
graph, F32, &input1_scale );
node_params[SCALAR_INPUT1_TAIL] = vsi_nn_kernel_scalar_create(
graph, F32, &input1Tail );
graph, F32, &input1_tail );
node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create(
graph, F32, &outputScale );
graph, F32, &output_scale );
node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create(
graph, F32, &outputZP );
graph, F32, &output_zp );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM );
@ -301,6 +321,12 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] );
}
}
final:
vsi_safe_release_tensor(reshape_tensors[0]);
vsi_safe_release_tensor(reshape_tensors[1]);
vsi_safe_release_tensor(reshape_tensors[2]);
return node;
} /* _setup() */

View File

@ -22,6 +22,7 @@
*
*****************************************************************************/
#if !(VX_TENSOR_POW_API_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -294,4 +295,4 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CL( pow, _setup )
#endif

View File

@ -0,0 +1,320 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
#define _RESIZE_CUBIC_KERNEL_SOURCE() "resize_cubic"
#define STR(a) #a
// Add kernel hashtable here
#define RESIZE_CUBIC_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
(( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) )
#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_CUBIC_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \
CVIVANTE_NAMESPACE("cl.resize_cubic_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
_RESIZE_CUBIC_KERNEL_SOURCE() }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _resize_cubic_kernel_map[] =
{
PACK_KERNEL_MAP( F32, F32),
PACK_KERNEL_MAP( U8, U8),
};
/*
* Kernel params
*/
static vx_param_description_t _resize_cubic_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define SCALAR_SCALE_X (2)
#define SCALAR_SCALE_Y (3)
#define SCALAR_HALF_PIXEL (4)
#define SCALAR_INPUT_SCALE (5)
#define SCALAR_INPUT_TAIL (6)
#define SCALAR_OUTPUT_SCALE (7)
#define SCALAR_OUTPUT_TAIL (8)
#define RESIZE_CUBIC_NUM 5
#define RESIZE_CUBIC_QUANT_NUM _cnt_of_array( _resize_cubic_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_resize_cubic_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_size_array_t * out_shape = NULL;
VSI_UNREFERENCED(param_size);
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
out_shape = output_attr->shape;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.dim = (out_shape->size < 3 || 1 == out_shape->data[2]) ? 2 : 3;
gpu_param.global_size[0] = gpu_align_p2(
(out_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (
(out_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1]);
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if ( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
SAFE_FREE_TENSOR_ATTR(output_attr);
return status;
} /* _resize_cubic_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
vsi_bool *is_use_u8_kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _resize_cubic_kernel_map;
size_t kernel_map_size = _cnt_of_array( _resize_cubic_kernel_map );
vx_param_description_t * param_def = _resize_cubic_kernel_param_def;
size_t param_def_size = _cnt_of_array( _resize_cubic_kernel_param_def );
vx_kernel_initialize_f initializer = _resize_cubic_initializer;
uint32_t key;
uint32_t i;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (F16 == in_dtype)
{
in_dtype = F32;
}
if (F16 == out_dtype)
{
out_dtype = F32;
}
if ((U8 == in_dtype) || (U8 == out_dtype))
{
param_def_size = RESIZE_CUBIC_QUANT_NUM;
*is_use_u8_kernel = TRUE;
}
else
{
param_def_size = RESIZE_CUBIC_NUM;
*is_use_u8_kernel = FALSE;
}
key = RESIZE_CUBIC_HASH_KEY( in_dtype, out_dtype );
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = (uint32_t)param_def_size;
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[RESIZE_CUBIC_QUANT_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" );
int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" );
vsi_size_t in_width = inputs[0]->attr.size[0];
vsi_size_t in_height = inputs[0]->attr.size[1];
vsi_size_t out_width = outputs[0]->attr.size[0];
vsi_size_t out_height = outputs[0]->attr.size[1];
float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float input_scale = vsi_nn_get_tensor_scale(inputs[0]);
float input_tail = -(input_zp * input_scale);
float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]);
float half_pixel_value = 0.0f;
float scale_factor_x = 0.0f;
float scale_factor_y = 0.0f;
vsi_bool is_use_u8_kernel = FALSE;
if (align_corners && out_width > 1)
{
scale_factor_x = ((vx_float32)(in_width - 1) * 1.0f) / (vx_float32)(out_width - 1);
}
else
{
scale_factor_x = ((vx_float32)in_width * 1.0f) / (vx_float32)out_width;
}
if (align_corners && out_height > 1)
{
scale_factor_y = ((vx_float32)(in_height - 1) * 1.0f) / (vx_float32)(out_height - 1);
}
else
{
scale_factor_y = ((vx_float32)in_height * 1.0f) / (vx_float32)out_height;
}
if (half_pixel_centers)
{
half_pixel_value = 0.5f;
}
else
{
half_pixel_value = 0.0f;
}
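/* Coordinate mapping sketch (assuming the shader follows the usual
 * half-pixel convention; the exact formula lives in the resize_cubic CL
 * source):
 *     in_x = (out_x + half_pixel_value) * scale_factor_x - half_pixel_value
 * so half_pixel_centers shifts the sample point by 0.5 on both sides, while
 * align_corners makes the scale factors above map the first and last pixels
 * of input and output onto each other exactly.
 */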
status = _query_kernel( kernel, inputs, outputs, &is_use_u8_kernel );
if (VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
size_t node_params_num = RESIZE_CUBIC_NUM;
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, RESIZE_CUBIC_QUANT_NUM,
inputs, input_num, outputs, output_num );
node_params[SCALAR_SCALE_X] = vsi_nn_kernel_scalar_create( graph, F32, &scale_factor_x );
node_params[SCALAR_SCALE_Y] = vsi_nn_kernel_scalar_create(graph, F32, &scale_factor_y );
node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, F32, &half_pixel_value );
if (is_use_u8_kernel)
{
node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale );
node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input_tail );
node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale );
node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &output_zp );
node_params_num = RESIZE_CUBIC_QUANT_NUM;
}
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
VSI_ASSERT( status == VSI_SUCCESS );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_X] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_Y] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] );
if (is_use_u8_kernel)
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_TAIL] );
}
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( resize_cubic, _setup )

View File

@ -0,0 +1,727 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
__BEGIN_DECLS
typedef enum
{
NONE = 0,
Add,
Mul,
Max,
Min
} vsi_scatter_nd_update_type_e;
/*
* Define kernel meta.
*/
#define KERNEL_SOURCE_1 "scatter_nd_update_reduction"
#define KERNEL_SOURCE_2 "scatter_nd_update_reduction_conv"
#define HASH_SCATTER_ND_UPDATE_KEY(_input0_type, _input2_type, _output_type, _stage, _op) \
((_input0_type << 24) | (_input2_type << 16) | (_output_type << 8) | (_stage << 4) | (_op))
#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_PREPROCESS_NAME(SRC0_TYPE) \
CVIVANTE_NAMESPACE("cl.scatter_nd_update_reduction_preprocess_"#SRC0_TYPE)
#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_PROCESS_NAME(REDUCTION_TYPE, SRC2_TYPE) \
CVIVANTE_NAMESPACE("cl.scatter_nd_update_reduction_"#REDUCTION_TYPE"_"#SRC2_TYPE)
#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_CONV_NAME(DST_TYPE) \
CVIVANTE_NAMESPACE("cl.scatter_nd_update_reduction_conv_"#DST_TYPE)
#define TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(IN0_TYPE, SOURCE) \
{ HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, 0, 0, 0, 0), \
HASH_SCATTER_ND_UPDATE_SH_KERNEL_PREPROCESS_NAME(IN0_TYPE), \
SOURCE },
#define TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(REDUCTION_TYPE, IN2_TYPE, SOURCE) \
{ HASH_SCATTER_ND_UPDATE_KEY(0, IN2_TYPE, 0, 1, REDUCTION_TYPE), \
HASH_SCATTER_ND_UPDATE_SH_KERNEL_PROCESS_NAME(REDUCTION_TYPE, IN2_TYPE), \
SOURCE },
#define TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(OUT_TYPE, SOURCE) \
{ HASH_SCATTER_ND_UPDATE_KEY(0, 0, OUT_TYPE, 2, 0), \
HASH_SCATTER_ND_UPDATE_SH_KERNEL_CONV_NAME(OUT_TYPE), \
SOURCE },
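/* Key layout: the three stages cannot collide because each wrapper fills a
 * different field of HASH_SCATTER_ND_UPDATE_KEY. Preprocess keys carry only
 * the ref dtype (stage 0), process keys carry the update dtype plus the
 * reduction op (stage 1), and convert keys carry only the output dtype
 * (stage 2); e.g. Add on F32 updates hashes as
 * HASH_SCATTER_ND_UPDATE_KEY(0, F32, 0, 1, Add).
 */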
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type scatter_nd_update_reduction_preprocess_map[] =
{
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(U8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(I8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(I16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(F16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(F32, KERNEL_SOURCE_1)
};
static const _kernel_map_type scatter_nd_update_reduction_process_map[] =
{
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, U8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, U8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, U8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, U8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, I8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, I8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, I8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, I8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, I16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, I16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, I16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, I16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, F16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, F16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, F16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, F16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, F32, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, F32, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, F32, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, F32, KERNEL_SOURCE_1)
};
static const _kernel_map_type scatter_nd_update_reduction_conv_map[] =
{
TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(U8, KERNEL_SOURCE_2)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(I8, KERNEL_SOURCE_2)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(I16, KERNEL_SOURCE_2)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(F16, KERNEL_SOURCE_2)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(F32, KERNEL_SOURCE_2)
};
/*
* Kernel params
*/
static vx_param_description_t _scatter_nd_update_preprocess_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
static vx_param_description_t _scatter_nd_update_process_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
static vx_param_description_t _scatter_nd_update_conv_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM _cnt_of_array(_scatter_nd_update_preprocess_kernel_param_def)
#define _SCATTER_ND_UPDATE_PROCESS_PARAM_NUM _cnt_of_array(_scatter_nd_update_process_kernel_param_def)
#define _SCATTER_ND_UPDATE_CONV_PARAM_NUM _cnt_of_array(_scatter_nd_update_conv_kernel_param_def)
static vsi_status cal_scatter_nd_update_tensor_reshape_size
(
vsi_nn_tensor_t ** inputs,
vsi_size_t sizes[VSI_NN_MAX_DIM_NUM],
uint32_t block_size,
uint32_t coordDim,
vsi_size_t strides[VSI_NN_MAX_DIM_NUM],
int32_t* newDim
)
{
vsi_status status = VSI_SUCCESS;
uint32_t dims_num = inputs[0]->attr.dim_num;
vsi_size_t *input_size = inputs[0]->attr.size;
uint32_t i = 0;
vsi_size_t elementCnt = 1;
#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH
newDim[0] = 0;
for (i = 0; i < dims_num; ++i)
{
elementCnt *= input_size[i];
}
for (i = 0; i < VSI_NN_MAX_DIM_NUM; ++i)
{
sizes[i] = 1;
}
sizes[0] = block_size;
sizes[1] = elementCnt / block_size;
newDim[0] = 2;
if (coordDim == 1 && strides) // index shape
{
for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++)
{
strides[i] = 0;
}
}
else if (coordDim >= 2 && coordDim <= VSI_NN_MAX_DIM_NUM && strides)
{
for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++)
{
strides[i] = 0;
}
strides[0] = input_size[dims_num - coordDim];
for (i = 1; i < coordDim - 1; i++)
{
strides[i] = strides[i - 1] * input_size[dims_num - coordDim + i];
}
}
#undef VSI_NN_MAX_IMAGE_WIDTH
return status;
} /* cal_scatter_nd_update_tensor_reshape_size */
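/* Illustration of the stride computation above: for a ref tensor with
 * attr.size = {W, H, C, N} (dims_num == 4) and coordDim == 3, the index
 * components address the outer three dimensions, so
 *     strides[0] = input_size[1] = H
 *     strides[1] = strides[0] * input_size[2] = H * C
 * and the remaining entries stay 0; coordDim == 1 leaves every stride at 0
 * because a single index component needs no multiplier.
 */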
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_scatter_nd_update_reduction_preprocess_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
1,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
int32_t width = 0;
int32_t element_size = 1;
int32_t i = 0;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
for (i = 0; i < (int32_t)attr[0]->shape->size; i++)
{
element_size *= (int32_t)attr[0]->shape->data[i];
}
width = element_size / 8;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
if (element_size < 8)
{
gpu_param.global_size[0] = element_size;
}
else
{
gpu_param.global_size[0] = width;
}
gpu_param.global_size[1] = 1;
gpu_param.global_size[2] = 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, final);
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
return status;
} /* _scatter_nd_update_reduction_preprocess_initializer() */
DEF_KERNEL_INITIALIZER(_scatter_nd_update_process_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
2,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
int32_t block_size = 1;
int32_t index_num = 1;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
block_size = (int32_t)(attr[1]->shape->data[0]);
index_num = (int32_t)(attr[0]->shape->data[1]);
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = block_size;
gpu_param.global_size[1] = index_num;
gpu_param.global_size[2] = 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, OnError);
OnError:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;
} /* _scatter_nd_update_process_initializer() */
DEF_KERNEL_INITIALIZER(_scatter_nd_update_conv_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
1,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
int32_t width = 0;
int32_t element_size = 1;
int32_t i = 0;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
for (i = 0; i < (int32_t)attr[0]->shape->size; i++)
{
element_size *= (int32_t)attr[0]->shape->data[i];
}
width = element_size / 8;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
if (element_size < 8)
{
gpu_param.global_size[0] = element_size;
}
else
{
gpu_param.global_size[0] = width;
}
gpu_param.global_size[1] = 1;
gpu_param.global_size[2] = 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, OnError);
OnError:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
return status;
} /* _scatter_nd_update_conv_initializer() */
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel_preprocess,
vsi_nn_kernel_t* kernel_process,
vsi_nn_kernel_t* kernel_conv,
int32_t reduction_flg
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e input2_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
size_t i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, 0, 0, 0, 0 );
for ( i = 0; i < _cnt_of_array(scatter_nd_update_reduction_preprocess_map); i ++ )
{
if ( scatter_nd_update_reduction_preprocess_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(scatter_nd_update_reduction_preprocess_map) )
{
snprintf( kernel_preprocess->info.name, VX_MAX_KERNEL_NAME, "%s",
scatter_nd_update_reduction_preprocess_map[i].function_name );
kernel_preprocess->info.parameters = _scatter_nd_update_preprocess_kernel_param_def;
kernel_preprocess->info.numParams = _cnt_of_array( _scatter_nd_update_preprocess_kernel_param_def );
kernel_preprocess->info.initialize = _scatter_nd_update_reduction_preprocess_initializer;
// Register code source
vsi_nn_kernel_add_source( kernel_preprocess, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
scatter_nd_update_reduction_preprocess_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel_preprocess, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
scatter_nd_update_reduction_preprocess_map[i].source_name );
status = VSI_SUCCESS;
}
key = HASH_SCATTER_ND_UPDATE_KEY( 0, input2_dtype, 0, 1, reduction_flg );
for ( i = 0; i < _cnt_of_array(scatter_nd_update_reduction_process_map); i ++ )
{
if ( scatter_nd_update_reduction_process_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(scatter_nd_update_reduction_process_map) )
{
snprintf( kernel_process->info.name, VX_MAX_KERNEL_NAME, "%s",
scatter_nd_update_reduction_process_map[i].function_name );
kernel_process->info.parameters = _scatter_nd_update_process_kernel_param_def;
kernel_process->info.numParams = _cnt_of_array( _scatter_nd_update_process_kernel_param_def );
kernel_process->info.initialize = _scatter_nd_update_process_initializer;
// Register code source
vsi_nn_kernel_add_source( kernel_process, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
scatter_nd_update_reduction_process_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel_process, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
scatter_nd_update_reduction_process_map[i].source_name );
status = VSI_SUCCESS;
}
key = HASH_SCATTER_ND_UPDATE_KEY( 0, 0, output_dtype, 2, 0 );
for ( i = 0; i < _cnt_of_array(scatter_nd_update_reduction_conv_map); i ++ )
{
if ( scatter_nd_update_reduction_conv_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(scatter_nd_update_reduction_conv_map) )
{
snprintf( kernel_conv->info.name, VX_MAX_KERNEL_NAME, "%s",
scatter_nd_update_reduction_conv_map[i].function_name );
kernel_conv->info.parameters = _scatter_nd_update_conv_kernel_param_def;
kernel_conv->info.numParams = _cnt_of_array( _scatter_nd_update_conv_kernel_param_def );
kernel_conv->info.initialize = _scatter_nd_update_conv_initializer;
// Register code source
vsi_nn_kernel_add_source( kernel_conv, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
scatter_nd_update_reduction_conv_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel_conv, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
scatter_nd_update_reduction_conv_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_t node = NULL;
vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}};
vsi_size_t strides[VSI_NN_MAX_DIM_NUM] = {0};
int32_t coord_strides[VSI_NN_MAX_DIM_NUM] = {0};
int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" );
int32_t reduction = vsi_nn_kernel_param_get_int32( params, "reduction" );
int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0;
float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float input_scale = vsi_nn_get_tensor_scale(inputs[0]);
float input_zp_scale = 0 - input_zp * input_scale;
float update_zp = (float)vsi_nn_get_tensor_zero_point(inputs[2]);
float update_scale = vsi_nn_get_tensor_scale(inputs[2]);
float update_zp_scale = 0 - update_zp * update_scale;
float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]);
vsi_nn_tensor_t * tensors[2] = { NULL };
vsi_nn_kernel_t * ikernels[2] = { NULL };
int32_t i = 0;
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = cal_scatter_nd_update_tensor_reshape_size(&inputs[1], shapes[0], coord_dim, 0,
NULL, &rs_idx_dim);
status |= cal_scatter_nd_update_tensor_reshape_size(&inputs[2], shapes[1], block_size, 0,
NULL, &rs_in_dim);
status |= cal_scatter_nd_update_tensor_reshape_size(&outputs[0], shapes[2], block_size, coord_dim,
strides, &rs_out_dim);
CHECK_STATUS_FAIL_GOTO( status, final );
coord_strides[coord_dim - 1] = 1;
for (i = 0; i < coord_dim - 1; i++)
{
coord_strides[i] = (int32_t)strides[coord_dim - 2 - i];
}
{
vsi_nn_tensor_attr_t attr;
vsi_nn_kernel_node_t preprocess_node = NULL;
vsi_nn_kernel_node_t process_node = NULL;
vsi_nn_kernel_node_param_t preprocess_params[_SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_param_t process_params[_SCATTER_ND_UPDATE_PROCESS_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_param_t conv_params[_SCATTER_ND_UPDATE_CONV_PARAM_NUM] = { NULL };
int32_t width = 1;
int32_t res = 0;
int32_t update_width = (int32_t)shapes[1][0];
int32_t output_width = (int32_t)shapes[2][0];
ikernels[0] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_CL );
ikernels[0]->unique_id = kernel->unique_id;
ikernels[1] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_CL );
ikernels[1]->unique_id = kernel->unique_id;
memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) );
attr.dtype = outputs[0]->attr.dtype;
attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
attr.is_const = FALSE;
attr.vtl = TRUE;
for (i = 0; i < rs_out_dim; i++)
{
attr.size[i] = shapes[2][i];
width *= (int32_t)shapes[2][i];
}
attr.dim_num = rs_out_dim;
res = width % 8;
width = (width >> 3) << 3;
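/* width is floored to a multiple of 8 and res keeps the 0..7 leftover
 * elements; this pairs with the preprocess/conv initializers above, which
 * launch element_size / 8 work-items, so each work-item presumably converts
 * 8 values and the shader uses res to finish the tail.
 */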
tensors[0] = vsi_nn_CreateTensor( graph, &attr ); // ref'
attr.size[0] = 1;
attr.size[1] = 1;
attr.dim_num = rs_out_dim;
tensors[1] = vsi_nn_CreateTensor( graph, &attr ); // link_buffer0
status = _query_kernel( inputs, outputs, ikernels[0], ikernels[1], kernel, reduction);
if ( VSI_SUCCESS == status)
{
// convert ref to float
preprocess_node = vsi_nn_kernel_create_node( graph, ikernels[0] );
if (preprocess_node)
{
uint32_t index = 0;
/* Pass parameters to node. */
preprocess_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[2], rs_out_dim );
preprocess_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t;
preprocess_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
preprocess_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &res );
preprocess_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale );
preprocess_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_zp_scale );
status = vsi_nn_kernel_node_pass_param( preprocess_node, preprocess_params,
_SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_tensor_release( &preprocess_params[0] );
vsi_nn_kernel_scalar_release( &preprocess_params[2] );
vsi_nn_kernel_scalar_release( &preprocess_params[3] );
vsi_nn_kernel_scalar_release( &preprocess_params[4] );
vsi_nn_kernel_scalar_release( &preprocess_params[5] );
}
// update
process_node = vsi_nn_kernel_create_node( graph, ikernels[1] );
if (process_node)
{
uint32_t index = 0;
/* Pass parameters to node. */
process_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[0], rs_idx_dim );
process_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shapes[1], rs_in_dim );
process_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t;
process_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t;
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[0] );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[1] );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[2] );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[3] );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[4] );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[5] );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[6] );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &update_width );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &output_width );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &update_scale );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &update_zp_scale );
status = vsi_nn_kernel_node_pass_param( process_node, process_params,
_SCATTER_ND_UPDATE_PROCESS_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_tensor_release( &process_params[0] );
vsi_nn_kernel_tensor_release( &process_params[1] );
vsi_nn_kernel_scalar_release( &process_params[4] );
vsi_nn_kernel_scalar_release( &process_params[5] );
vsi_nn_kernel_scalar_release( &process_params[6] );
vsi_nn_kernel_scalar_release( &process_params[7] );
vsi_nn_kernel_scalar_release( &process_params[8] );
vsi_nn_kernel_scalar_release( &process_params[9] );
vsi_nn_kernel_scalar_release( &process_params[10] );
vsi_nn_kernel_scalar_release( &process_params[11] );
vsi_nn_kernel_scalar_release( &process_params[12] );
vsi_nn_kernel_scalar_release( &process_params[13] );
vsi_nn_kernel_scalar_release( &process_params[14] );
vsi_nn_kernel_scalar_release( &process_params[15] );
}
// convert float to output
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
uint32_t index = 0;
/* Pass parameters to node. */
conv_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t;
conv_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t;
conv_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], rs_out_dim );
conv_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
conv_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &res );
conv_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale );
conv_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp );
status = vsi_nn_kernel_node_pass_param( node, conv_params, _SCATTER_ND_UPDATE_CONV_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_tensor_release( &conv_params[2] );
vsi_nn_kernel_scalar_release( &conv_params[3] );
vsi_nn_kernel_scalar_release( &conv_params[4] );
vsi_nn_kernel_scalar_release( &conv_params[5] );
vsi_nn_kernel_scalar_release( &conv_params[6] );
}
}
if (preprocess_node) {vsi_nn_kernel_node_release( &preprocess_node );}
if (process_node) {vsi_nn_kernel_node_release( &process_node );}
}
final:
if (ikernels[0])
{
vsi_nn_kernel_release(&ikernels[0]);
}
if (ikernels[1])
{
vsi_nn_kernel_release(&ikernels[1]);
}
vsi_safe_release_tensor(tensors[0]);
vsi_safe_release_tensor(tensors[1]);
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( scatter_nd_update_reduction, _setup )

View File

@ -22,6 +22,7 @@
*
*****************************************************************************/
#if !(VX_TENSOR_SELECT_VX_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -359,3 +360,4 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CL( select, _setup )
#endif

View File

@ -22,6 +22,7 @@
*
*****************************************************************************/
#if !(VX_TENSOR_TILE_API_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -445,3 +446,4 @@ final:
__END_DECLS
REGISTER_BACKEND_CL( tile, _setup )
#endif

View File

@ -438,7 +438,7 @@ static vsi_nn_kernel_node_t _setup
vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }};
int32_t width = (int32_t)block_size;
int32_t top_k = vsi_nn_kernel_param_get_int32(params, "top_k");
int32_t num_stages = (int32_t)ceil(log10(block_size / 2.0f) / log10(2.0f));
int32_t num_stages = (int32_t)vsi_nn_max(ceil(log10(block_size / 2.0f) / log10(2.0f)), 0);
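/* num_stages is the stage count of the sorting network,
 * ceil(log2(block_size / 2)): e.g. block_size = 32 gives ceil(log2(16)) = 4.
 * The vsi_nn_max(..., 0) clamp added here keeps tiny block sizes legal
 * (block_size = 1 would otherwise yield -1).
 */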
vsi_bool is_odd_even_sort = FALSE;
size_t param_num = _TOPK_PARAM_NUM;
float inputScale = vsi_nn_get_tensor_scale(inputs[0]);

View File

@ -106,14 +106,12 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer)
vsi_nn_kernel_dtype_e output_dtype = F16;
vsi_nn_kernel_tensor_attr_t *input0_attr = NULL, *input1_attr = NULL, *output_attr = NULL;
vsi_size_array_t *input_shape = NULL;
float scaleIn = 1.0f;
int32_t input_ZP = 0;
float scaleIn1 = 1.0f;
int32_t input_ZP1 = 0;
float scaleOut = 1.0f;
int32_t output_ZP = 0;
int32_t fixpoint = 0, fixpoint1 = 0, fixpoint_out = 0;
float inScale_dfp, inScale_dfp1;
float scaleIn = 1.0f;
int32_t input_ZP = 0;
float scaleIn1 = 1.0f;
int32_t input_ZP1 = 0;
float scaleOut = 1.0f;
int32_t output_ZP = 0;
float eps = 0.0f;
float rsEps = 0.0f;
float dimRatio = 0.0f;
@ -135,80 +133,12 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer)
rsEps = (float)(1.0f / sqrtf(eps));
dimRatio = (float)(1.0 / (input_shape->data[0]));
if ( VSI_NN_KERNEL_QUANT_DFP == input0_attr->quant )
{
fixpoint = input0_attr->dfp.fl;
}
else if ( VSI_NN_KERNEL_QUANT_ASYMM == input0_attr->quant )
{
input_ZP = input0_attr->asymm.zero_point;
scaleIn = input0_attr->asymm.scale;
}
else
{
input_ZP = 0;
scaleIn = 1.0f;
}
//input1
if ( VSI_NN_KERNEL_QUANT_DFP == input1_attr->quant )
{
fixpoint1 = input1_attr->dfp.fl;
}
else if ( VSI_NN_KERNEL_QUANT_ASYMM == input1_attr->quant )
{
input_ZP1 = input1_attr->asymm.zero_point;
scaleIn1 = input1_attr->asymm.scale;
}
else
{
input_ZP1 = 0;
scaleIn1 = 1.0f;
}
//output
if ( VSI_NN_KERNEL_QUANT_DFP == output_attr->quant )
{
fixpoint_out = output_attr->dfp.fl;
if (fixpoint_out >= 0)
{
scaleOut = 1.0f / (vx_float32) ((int64_t)1 << fixpoint_out);
}
else
{
scaleOut = (vx_float32) ((int64_t)1 << -fixpoint_out);
}
output_ZP = 0;
}
else if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant )
{
output_ZP = output_attr->asymm.zero_point;
scaleOut = output_attr->asymm.scale;
}
else
{
output_ZP = 0;
scaleOut = 1.0f;
}
if (fixpoint >= 0)
{
inScale_dfp = 1.0f / (vx_float32) ((int64_t)1 << fixpoint);
}
else
{
inScale_dfp = (vx_float32) ((int64_t)1 << -fixpoint);
}
if (fixpoint1 >= 0)
{
inScale_dfp1 = 1.0f / (vx_float32) ((int64_t)1 << fixpoint1);
}
else
{
inScale_dfp1 = (vx_float32) ((int64_t)1 << -fixpoint1);
}
scaleIn = input0_attr->scale;
input_ZP = input0_attr->zero_point;
scaleIn1 = input1_attr->scale;
input_ZP1 = input1_attr->zero_point;
scaleOut = output_attr->scale;
output_ZP = output_attr->zero_point;
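/* The unified attr->scale / attr->zero_point fields already encode what the
 * deleted branches computed by hand: for a DFP tensor with fraction length
 * fl they should hold scale = 2^-fl (fl >= 0) or 2^|fl| (fl < 0) with a zero
 * point of 0, so reading scale/zero_point directly covers the DFP, ASYMM and
 * non-quantized cases alike (an assumption about the attr helpers, but
 * consistent with the code removed above).
 */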
gpu_param.global_offset[0] = 0;
gpu_param.global_offset[1] = 0;
@ -349,8 +279,8 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer)
&uniConvertInt16ScaleToFp32Fst_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16ScaleToFp32Sec_4x4",
&uniConvertInt16ScaleToFp32Sec_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "inScale_i16", &inScale_dfp);
status |= vsi_nn_kernel_gpu_add_param(node, "inScale1_i16", &inScale_dfp1);
status |= vsi_nn_kernel_gpu_add_param(node, "inScale_i16", &scaleIn);
status |= vsi_nn_kernel_gpu_add_param(node, "inScale1_i16", &scaleIn1);
CHECK_STATUS_FAIL_GOTO(status, final );
}
width = (int32_t)input_shape->data[0];

View File

@ -215,41 +215,11 @@ DEF_KERNEL_INITIALIZER(_batch_norm_initializer)
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output);
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
if( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = input_attr->dfp.fl;
if (fl > 0)
{
input_scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
input_scale = (float)((int64_t)1 << -fl);
}
}
else if( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
input_scale = input_attr->asymm.scale;
input_tail = 0 - input_scale * (float)input_attr->asymm.zero_point;
}
input_scale = input_attr->scale;
input_tail = 0 - input_scale * (float)input_attr->zero_point;
if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = output_attr->dfp.fl;
if (fl > 0)
{
output_scale = (float) ((int64_t)1 << fl);
}
else
{
output_scale = 1.0f / (float)((int64_t)1 << -fl);
}
}
else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
output_scale = 1.0f / output_attr->asymm.scale;
output_zp = (float)output_attr->asymm.zero_point;
}
output_scale = 1.0f / output_attr->scale;
output_zp = (float)output_attr->zero_point;
pack_key = _PACK_BATCH_NORM_KEY( input_attr->dtype, output_attr->dtype );

View File

@ -121,23 +121,20 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
vsi_nn_kernel_dtype_e output_dtype = F16;
uint32_t depth = 0;
float half_input0_wh[2];
float add_float_value[2];
uint32_t in0_width;
uint32_t in0_height;
uint32_t out_width;
uint32_t out_height;
int32_t align_corners;
float half_input0_wh[2] = {0};
float add_float_value[2] = {0};
uint32_t in0_width = 0;
uint32_t in0_height = 0;
uint32_t out_width = 0;
uint32_t out_height = 0;
int32_t align_corners = 0;
int32_t src0FixPointPos = 0;
int32_t src1FixPointPos = 0;
int32_t dstFixPointPos = 0;
float input0_scale = 1.0;
int32_t input0ZP = 0;
float input1_scale = 1.0;
int32_t input1ZP = 0;
float output_scale = 1.0;
int32_t outputZP = 0;
float input0_scale = 1.0;
int32_t input0ZP = 0;
float input1_scale = 1.0;
int32_t input1ZP = 0;
float output_scale = 1.0;
int32_t outputZP = 0;
VSI_UNREFERENCED(param_size);
@ -165,54 +162,14 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
input1_dtype = input_attr[1]->dtype;
output_dtype = output_attr->dtype;
if (U8 == input0_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr[0]->quant) {
input0_scale = input_attr[0]->asymm.scale;
input0ZP = input_attr[0]->asymm.zero_point;
} else if (VSI_NN_KERNEL_QUANT_DFP == input_attr[0]->quant) {
src0FixPointPos = input_attr[0]->dfp.fl;
if (src0FixPointPos >= 0) {
input0_scale = 1.0f / (float)((int64_t)1 << src0FixPointPos);
} else if (src0FixPointPos < 0) {
input0_scale = (float)((int64_t)1 << -src0FixPointPos);
}
input0ZP = 0;
} else {
input0_scale = 1.0f;
input0ZP = 0;
}
input0_scale = input_attr[0]->scale;
input0ZP = input_attr[0]->zero_point;
if (U8 == input1_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr[1]->quant) {
input1_scale = input_attr[1]->asymm.scale;
input1ZP = input_attr[1]->asymm.zero_point;
} else if (VSI_NN_KERNEL_QUANT_DFP == input_attr[1]->quant) {
src1FixPointPos = input_attr[1]->dfp.fl;
if (src1FixPointPos >= 0) {
input1_scale = 1.0f / (float)((int64_t)1 << src1FixPointPos);
} else if (src1FixPointPos < 0) {
input1_scale = (float)((int64_t)1 << -src1FixPointPos);
}
input1ZP = 0;
} else {
input1_scale = 1.0f;
input1ZP = 0;
}
if (U8 == output_dtype && VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant) {
output_scale = output_attr->asymm.scale;
outputZP = output_attr->asymm.zero_point;
} else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) {
dstFixPointPos = output_attr->dfp.fl;
if (dstFixPointPos >= 0) {
output_scale = (float)((int64_t)1 << dstFixPointPos);
} else if (dstFixPointPos < 0) {
output_scale = 1.0f / (float)((int64_t)1 << -dstFixPointPos);
}
outputZP = 0;
} else {
output_scale = 1.0;
outputZP = 0;
}
input1_scale = input_attr[1]->scale;
input1ZP = input_attr[1]->zero_point;
output_scale = output_attr->scale;
outputZP = output_attr->zero_point;
in0_width = (uint32_t)(in0_shape->data[0]);
in0_height = (uint32_t)(in0_shape->data[1]);
@ -496,7 +453,7 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
I16 == output_dtype)) ||
((I8 == input0_dtype && I8 == input1_dtype &&
I8 == output_dtype))) {
float dfpScale = input0_scale * output_scale;
float dfpScale = input0_scale / output_scale;
gpu_dp_inst_t uniDFPtoFp32_part0_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
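Editor's note: the switch from multiplication to division in dfpScale tracks the representation change above. The removed DFP path stored the reciprocal of the tensor scale in output_scale (1 << fl for a non-negative fractional length), so combining input and output was a multiply; with output_scale now holding the tensor scale itself, the same ratio is input0_scale / output_scale. A tiny sanity check under that assumption:

    #include <assert.h>
    #include <math.h>

    int main(void)
    {
        const int   fl            = 5;                        /* example fractional length */
        const float in_scale      = 0.25f;                    /* example input scale       */
        const float out_scale     = 1.0f / (float)(1 << fl);  /* tensor scale              */
        const float out_scale_rcp = (float)(1 << fl);         /* old stored (reciprocal)   */

        const float old_form = in_scale * out_scale_rcp;
        const float new_form = in_scale / out_scale;
        assert(fabsf(old_form - new_form) < 1e-6f);
        return 0;
    }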

View File

@ -179,7 +179,6 @@ DEF_KERNEL_INITIALIZER(_clip_initializer)
/ gpu_param.global_scale[1]);
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
srcFixPointPos = input_attr->dfp.fl;

View File

@ -22,6 +22,7 @@
*
*****************************************************************************/
#if !(VX_RELATIONAL_OPS_VX_SUPPORT_EXT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -319,41 +320,10 @@ DEF_KERNEL_INITIALIZER(_comparisons_initializer)
out_shape = attr[2]->shape;
if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[0]->dfp.fl;
if (fl > 0)
{
input0Scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
input0Scale = (float)((int64_t)1 << -fl);
}
}
else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
input0Scale = attr[0]->asymm.scale;
input0Tail = 0 - attr[0]->asymm.zero_point * input0Scale;
}
if( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[1]->dfp.fl;
if (fl > 0)
{
input1Scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
input1Scale = (float)((int64_t)1 << -fl);
}
}
else if( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
input1Scale = attr[1]->asymm.scale;
input1Tail = 0 - attr[1]->asymm.zero_point * input1Scale;
}
input0Scale = attr[0]->scale;
input0Tail = 0 - attr[0]->zero_point * input0Scale;
input1Scale = attr[1]->scale;
input1Tail = 0 - attr[1]->zero_point * input1Scale;
gpu_param.global_scale[0] = 8;
gpu_param.global_scale[1] = 1;
@ -616,3 +586,4 @@ final:
REGISTER_BACKEND_EVIS( relational_ops, _setup )
__END_DECLS
#endif

View File

@ -152,23 +152,12 @@ DEF_KERNEL_INITIALIZER(_conv1d_ovxlib_initializer)
out_shape = output_attr->shape;
weight_shape = weights_attr->shape;
if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant )
{
input_ZP = input_attr->asymm.zero_point;
scaleIn = input_attr->asymm.scale;
}
if ( VSI_NN_KERNEL_QUANT_ASYMM == weights_attr->quant )
{
weight_ZP = weights_attr->asymm.zero_point;
scaleWights = weights_attr->asymm.scale;
}
if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant )
{
output_ZP = (float)output_attr->asymm.zero_point;
scaleOut = output_attr->asymm.scale;
}
input_ZP = input_attr->zero_point;
scaleIn = input_attr->scale;
weight_ZP = weights_attr->zero_point;
scaleWights = weights_attr->scale;
output_ZP = (float)output_attr->zero_point;
scaleOut = output_attr->scale;
scaleOut = (scaleIn * scaleWights) / scaleOut;
input_height = (int32_t)(in_shape->data[1]);

View File

@ -0,0 +1,540 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
#include "utils/vsi_nn_dtype_util.h"
__BEGIN_DECLS
typedef enum _crop_and_resize_type_e
{
nearest_neighbor = VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR,
bilinear = VSI_NN_INTERPOLATION_BILINEAR,
}crop_and_resize_type_e;
#define _CROP_AND_RESIZE_KERNEL_SOURCE_NAME "crop_and_resize_"
// Add kernel hashtable here
#define CROP_AND_RESIZE_HASH_KEY( IN_DTYPE, OUT_DTYPE, RESIZE_METHOD ) \
(( IN_DTYPE << 16 ) | ( OUT_DTYPE << 8) | (RESIZE_METHOD))
#define CROP_AND_RESIZE_KERNEL( IN_DTYPE, OUT_DTYPE, RESIZE_METHOD ) \
{ CROP_AND_RESIZE_HASH_KEY( IN_DTYPE, OUT_DTYPE, RESIZE_METHOD ), \
CVIVANTE_NAMESPACE("evis.crop_and_resize_"#RESIZE_METHOD"_"#IN_DTYPE"to"#OUT_DTYPE), \
_CROP_AND_RESIZE_KERNEL_SOURCE_NAME#RESIZE_METHOD }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _crop_and_resize_kernel_map[] =
{
// Register kernel here
CROP_AND_RESIZE_KERNEL( U8, U8, nearest_neighbor ),
CROP_AND_RESIZE_KERNEL( U8, F16, nearest_neighbor ),
CROP_AND_RESIZE_KERNEL( F16, F16, nearest_neighbor),
CROP_AND_RESIZE_KERNEL( F16, U8, nearest_neighbor ),
CROP_AND_RESIZE_KERNEL( F16, I8, nearest_neighbor),
CROP_AND_RESIZE_KERNEL( I8, I8, nearest_neighbor ),
CROP_AND_RESIZE_KERNEL( I8, F16, nearest_neighbor),
CROP_AND_RESIZE_KERNEL( I16, I16, nearest_neighbor ),
CROP_AND_RESIZE_KERNEL( I16, F16, nearest_neighbor),
CROP_AND_RESIZE_KERNEL( U8, U8, bilinear),
CROP_AND_RESIZE_KERNEL( U8, F16, bilinear),
CROP_AND_RESIZE_KERNEL( F16, F16, bilinear),
CROP_AND_RESIZE_KERNEL( F16, U8, bilinear),
CROP_AND_RESIZE_KERNEL( F16, I8, bilinear),
CROP_AND_RESIZE_KERNEL( I8, I8, bilinear),
CROP_AND_RESIZE_KERNEL( I8, F16, bilinear),
CROP_AND_RESIZE_KERNEL( I16, I16, bilinear),
CROP_AND_RESIZE_KERNEL( I16, F16, bilinear),
};
/*
* Kernel params
*/
static vx_param_description_t _crop_and_resize_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _CROP_AND_RESIZE_PARAM_NUM _cnt_of_array( _crop_and_resize_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_crop_and_resize_nearest_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
};
vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
int32_t crop_width = 0;
int32_t crop_height = 0;
int32_t image_width = 0;
int32_t image_height = 0;
int32_t batch_out = 0;
float width_scale = 0;
float height_scale = 0;
float src0ZP = 0;
float src0Scale = 1;
float dstZP = 0;
float dstScale = 1;
float inOutScale = 0;
float inOutTile = 0;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &batch_out);
CHECK_STATUS_FAIL_GOTO(status, final );
src0Scale = attr[0]->scale;
src0ZP = (float)attr[0]->zero_point;
dstScale = attr[1]->scale;
dstZP = (float)attr[1]->zero_point;
inOutScale = src0Scale / dstScale;
inOutTile = dstZP - inOutScale * src0ZP;
image_width = (int32_t)(attr[0]->shape->data[0]);
image_height = (int32_t)(attr[0]->shape->data[1]);
crop_width = (int32_t)(attr[1]->shape->data[0]);
crop_height = (int32_t)(attr[1]->shape->data[1]);
width_scale = (crop_width > 1) ? (float)(image_width - 1) / (crop_width -1) : 0;
height_scale = (crop_height > 1) ? (float)(image_height - 1) / (crop_height -1) : 0;
gpu_param.global_scale[0] = 8;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = gpu_align_p2((crop_width + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 8);
gpu_param.global_size[1] = (crop_height + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1];
gpu_param.global_size[2] = (batch_out + gpu_param.global_scale[2] - 1)
/ gpu_param.global_scale[2];
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, final);
{
gpu_dp_inst_t uniExtract8Bit_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtractHalf8_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00002100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertFstToFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00000000, 0x00003c00, 0x00000000,
0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniConvertSecToFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00050004, 0x00070006, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00000000, 0x00003c00, 0x00000000,
0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniConvertFstToFp32_4x4", &uniConvertFstToFp32_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertSecToFp32_4x4", &uniConvertSecToFp32_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node, "uniExtract8Bit_2x8", &uniExtract8Bit_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node, "uniExtractHalf8_2x8", &uniExtractHalf8_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node, "inOutScale", &inOutScale );
status |= vsi_nn_kernel_gpu_add_param( node, "inOutTile", &inOutTile );
status |= vsi_nn_kernel_gpu_add_param( node, "width_scale", &width_scale );
status |= vsi_nn_kernel_gpu_add_param( node, "height_scale", &height_scale );
status |= vsi_nn_kernel_gpu_add_param( node, "image_width", &image_width );
status |= vsi_nn_kernel_gpu_add_param( node, "image_height", &image_height );
CHECK_STATUS_FAIL_GOTO(status, final);
}
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;
} /* _crop_and_resize_nearest_initializer() */
DEF_KERNEL_INITIALIZER(_crop_and_resize_bilinear_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
};
vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
int32_t crop_width = 0;
int32_t crop_height = 0;
int32_t image_width = 0;
int32_t image_height = 0;
int32_t batch_out = 0;
float width_scale = 0;
float height_scale = 0;
float src0ZP = 0;
float src0Scale = 1;
float dstZP = 0;
float dstScale = 1;
float inOutScale = 0;
float inOutTile = 0;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &batch_out);
CHECK_STATUS_FAIL_GOTO(status, final );
src0Scale = attr[0]->scale;
src0ZP = (float)attr[0]->zero_point;
dstScale = attr[1]->scale;
dstZP = (float)attr[1]->zero_point;
inOutScale = src0Scale / dstScale;
inOutTile = dstZP - inOutScale * src0ZP;
image_width = (int32_t)(attr[0]->shape->data[0]);
image_height = (int32_t)(attr[0]->shape->data[1]);
crop_width = (int32_t)(attr[1]->shape->data[0]);
crop_height = (int32_t)(attr[1]->shape->data[1]);
width_scale = (crop_width > 1) ? (float)(image_width - 1) / (crop_width -1) : 0;
height_scale = (crop_height > 1) ? (float)(image_height - 1) / (crop_height -1) : 0;
gpu_param.global_scale[0] = 4;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = gpu_align_p2((crop_width + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (crop_height + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1];
gpu_param.global_size[2] = (batch_out + gpu_param.global_scale[2] - 1)
/ gpu_param.global_scale[2];
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, final);
{
gpu_dp_inst_t uniExtract8Bit_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtractHalf8_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00002100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniRightToFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00030001, 0x00070005, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniLeftToFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00020000, 0x00060004, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001,
0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
status = vsi_nn_kernel_gpu_add_param( node, "uniRightToFp32_4x4", &uniRightToFp32_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node, "uniLeftToFp32_4x4", &uniLeftToFp32_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node, "uniExtract8Bit_2x8", &uniExtract8Bit_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node, "uniExtractHalf8_2x8", &uniExtractHalf8_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node, "inOutScale", &inOutScale );
status |= vsi_nn_kernel_gpu_add_param( node, "inOutTile", &inOutTile );
status |= vsi_nn_kernel_gpu_add_param( node, "width_scale", &width_scale );
status |= vsi_nn_kernel_gpu_add_param( node, "height_scale", &height_scale );
status |= vsi_nn_kernel_gpu_add_param( node, "image_width", &image_width );
status |= vsi_nn_kernel_gpu_add_param( node, "image_height", &image_height );
CHECK_STATUS_FAIL_GOTO(status, final);
}
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;
} /* _crop_and_resize_bilinear_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t resize_method
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _crop_and_resize_kernel_map;
size_t kernel_map_size = _cnt_of_array( _crop_and_resize_kernel_map );
vx_param_description_t * param_def = _crop_and_resize_kernel_param_def;
vx_kernel_initialize_f initializer = _crop_and_resize_nearest_initializer;
uint32_t key;
uint32_t i;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (resize_method == bilinear)
{
initializer = _crop_and_resize_bilinear_initializer;
}
key = CROP_AND_RESIZE_HASH_KEY( in_dtype, out_dtype, resize_method );
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = _cnt_of_array( _crop_and_resize_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_CROP_AND_RESIZE_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL;
vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{0}};
uint32_t ori_depth = (uint32_t)inputs[0]->attr.size[2];
uint32_t ori_batchout = (uint32_t)outputs[0]->attr.size[3];
float extrapolation_value = vsi_nn_kernel_param_get_float32( params, "extrapolation_value" );
int32_t resize_method = vsi_nn_kernel_param_get_int32( params, "resize_method" );
VSI_UNREFERENCED(params);
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
shapes[0][0] = inputs[0]->attr.size[0];
shapes[0][1] = inputs[0]->attr.size[1];
shapes[0][2] = inputs[0]->attr.size[2] * inputs[0]->attr.size[3];
shapes[1][0] = outputs[0]->attr.size[0];
shapes[1][1] = outputs[0]->attr.size[1];
shapes[1][2] = outputs[0]->attr.size[2] * outputs[0]->attr.size[3];
rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[0], 3 );
rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[1], 3 );
if (rs_input == NULL || rs_output == NULL)
{
goto final;
}
status = _query_kernel( kernel, inputs, outputs, resize_method );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
node_params[0] = rs_input;
node_params[1] = (vsi_nn_kernel_node_param_t)(inputs[1]->t);
node_params[2] = (vsi_nn_kernel_node_param_t)(inputs[2]->t);
node_params[3] = rs_output;
node_params[4] = vsi_nn_kernel_scalar_create( graph, I32, &ori_depth );
node_params[5] = vsi_nn_kernel_scalar_create( graph, I32, &ori_batchout );
status = vsi_nn_kernel_node_pass_param( node, node_params, _CROP_AND_RESIZE_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
}
{
// Set default border mode.
vx_border_t border;
border.mode = VX_BORDER_CONSTANT;
vsi_nn_Float32ToDtype(extrapolation_value, (uint8_t*)&border.constant_value.U32, &inputs[0]->attr.dtype);
status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) );
CHECK_STATUS(status);
}
}
final:
if (rs_input)
{
vsi_nn_kernel_tensor_release( &rs_input );
}
if (rs_output)
{
vsi_nn_kernel_tensor_release( &rs_output );
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( crop_and_resize, _setup )
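Editor's note: for reference, the per-pixel arithmetic of the nearest-neighbor variant can be sketched on the CPU as below. This deliberately ignores the box coordinates (handled in the shader from the second input tensor) and only shows the (size - 1) / (crop - 1) scale factors and the inOutScale/inOutTile requantization; it is a hedged illustration, not the kernel's exact addressing.

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Nearest-neighbor resize of a quantized 2-D plane, requantizing on the fly.
     * width/height scales follow the (size-1)/(crop-1) form used above.        */
    static void resize_nearest_u8(const uint8_t *src, int iw, int ih,
                                  uint8_t *dst, int ow, int oh,
                                  float in_out_scale, float in_out_tile)
    {
        const float wscale = ow > 1 ? (float)(iw - 1) / (float)(ow - 1) : 0.0f;
        const float hscale = oh > 1 ? (float)(ih - 1) / (float)(oh - 1) : 0.0f;
        for (int y = 0; y < oh; ++y) {
            const int sy = (int)roundf((float)y * hscale);
            for (int x = 0; x < ow; ++x) {
                const int sx = (int)roundf((float)x * wscale);
                const float v = (float)src[sy * iw + sx] * in_out_scale + in_out_tile;
                const float c = v < 0.0f ? 0.0f : (v > 255.0f ? 255.0f : v);
                dst[y * ow + x] = (uint8_t)(c + 0.5f);
            }
        }
    }

    int main(void)
    {
        const uint8_t src[4] = { 0, 64, 128, 255 };  /* 2x2 source plane */
        uint8_t dst[9];                              /* 3x3 destination  */
        /* Identity requantization: equal scales, both zero points zero. */
        resize_nearest_u8(src, 2, 2, dst, 3, 3, 1.0f, 0.0f);
        for (int i = 0; i < 9; ++i) printf("%d%c", dst[i], (i % 3 == 2) ? '\n' : ' ');
        return 0;
    }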

View File

@ -204,39 +204,11 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer)
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &reverse);
CHECK_STATUS_FAIL_GOTO(status, OnError );
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[0]->dfp.fl > 0)
{
input_scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
}
else
{
input_scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
input_scale = attr[0]->asymm.scale;
input_zp = attr[0]->asymm.zero_point;
}
input_scale = attr[0]->scale;
input_zp = attr[0]->zero_point;
if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[1]->dfp.fl > 0)
{
output_scale = (float)((int64_t)1 << attr[1]->dfp.fl);
}
else
{
output_scale = (1.0f / (float)((int64_t)1 << -attr[1]->dfp.fl));
}
}
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
output_scale = 1.0f / attr[1]->asymm.scale;
output_zp = (float)attr[1]->asymm.zero_point;
}
output_scale = 1.0f / attr[1]->scale;
output_zp = (float)attr[1]->zero_point;
in_out_scale = input_scale * output_scale;
in_out_zp_scale = (float)in_out_scale * input_zp * (-1);

View File

@ -22,7 +22,7 @@
*
*****************************************************************************/
#if !(VX_DEPTH2SPACE_CRD_MODE_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -161,51 +161,10 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer)
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &block_size);
CHECK_STATUS_FAIL_GOTO(status, OnError );
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
src0ZP = attr[0]->asymm.zero_point;
src0Scale = attr[0]->asymm.scale;
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[0]->dfp.fl > 0)
{
src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
}
else
{
src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
src0ZP = 0;
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
src0Scale = 1;
src0ZP = 0;
}
if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
dstZP = attr[1]->asymm.zero_point;
dstScale = attr[1]->asymm.scale;
}
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[1]->dfp.fl > 0)
{
dstScale = (1.0f / (float)((int64_t)1 << attr[1]->dfp.fl));
}
else
{
dstScale = (float)((int64_t)1 << -attr[1]->dfp.fl);
}
dstZP = 0;
}
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
dstScale = 1;
dstZP = 0;
}
src0ZP = attr[0]->zero_point;
src0Scale = attr[0]->scale;
dstZP = attr[1]->zero_point;
dstScale = attr[1]->scale;
output_dims = (uint32_t)attr[1]->shape->size;
output_width = (int32_t)(attr[1]->shape->data[0]);
@ -454,4 +413,4 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_EVIS( depth2space_internal, _setup )
#endif

View File

@ -250,12 +250,12 @@ DEF_KERNEL_INITIALIZER(_depthwise_conv1d_initializer)
gpu_param.global_size[1] = gpu_align_p2((output_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1], gpu_param.local_size[1]);
outputScale = input_attr->asymm.scale;
outputScale = input_attr->scale;
outputScale *= weight_attr->asymm.scale;
weightZP = weight_attr->asymm.zero_point;
outputScale /= output_attr->asymm.scale;
outputZP = (float)output_attr->asymm.zero_point + 0.5f;
outputScale *= weight_attr->scale;
weightZP = weight_attr->zero_point;
outputScale /= output_attr->scale;
outputZP = (float)output_attr->zero_point + 0.5f;
#define _PACK_SELECT_KEY( kernel_size, dilation, evis_version ) \
((uint64_t)kernel_size | ((uint64_t)dilation << 16) | ((uint64_t)evis_version << 32))

View File

@ -135,17 +135,10 @@ DEF_KERNEL_INITIALIZER(_detect_post_box_initializer)
status = vsi_nn_kernel_gpu_add_param( node, "logE", &logE);
CHECK_STATUS_FAIL_GOTO(status, final );
if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant )
{
input0_ZP = input_attr->asymm.zero_point;
scaleIn0 = input_attr->asymm.scale;
}
if ( VSI_NN_KERNEL_QUANT_ASYMM == input1_attr->quant )
{
input1_ZP = input1_attr->asymm.zero_point;
scaleIn1 = input1_attr->asymm.scale;
}
input0_ZP = input_attr->zero_point;
scaleIn0 = input_attr->scale;
input1_ZP = input1_attr->zero_point;
scaleIn1 = input1_attr->scale;
if ((F32 == input_attr->dtype) || (F32 == input1_attr->dtype))
{

View File

@ -60,6 +60,7 @@ typedef enum
UNARY_ATANH,
UNARY_ACOSH,
UNARY_INVERSE_SIGMOID,
UNARY_TAN,
} unary_type_e;
/*
@ -108,6 +109,7 @@ typedef enum
#define ATANH_OPERATION atanh
#define ACOSH_OPERATION acosh
#define INVERSE_SIGMOID_OPERATION inverse_sigmoid
#define TAN_OPERATION tan
#define ADD_UNARY_SH_KERNELS(name, source) \
TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, BF16, BF16, source##_3D) \
@ -153,6 +155,7 @@ static const struct {
ADD_UNARY_SH_KERNELS(ATAN, KERNEL_SOURCE1)
ADD_UNARY_SH_KERNELS(ATANH, KERNEL_SOURCE1)
ADD_UNARY_SH_KERNELS(ACOSH, KERNEL_SOURCE1)
ADD_UNARY_SH_KERNELS(TAN, KERNEL_SOURCE1)
ADD_UNARY_SH_KERNELS(HSIGMOID, KERNEL_SOURCE0)
ADD_UNARY_SH_KERNELS(MISH, KERNEL_SOURCE0)
@ -177,6 +180,7 @@ static const struct {
#undef RCP_OPERATION
#undef SIGN_OPERATION
#undef SOFTSIGN_OPERATION
#undef TAN_OPERATION
/*
* Kernel params
*/
@ -243,41 +247,10 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer)
}
out_shape = attr[1]->shape;
if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[0]->dfp.fl;
if (fl > 0)
{
inputScale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
inputScale = (float)((int64_t)1 << -fl);
}
}
else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
inputScale = attr[0]->asymm.scale;
inputTail = 0 - attr[0]->asymm.zero_point * inputScale;
}
if( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[1]->dfp.fl;
if (fl > 0)
{
outputScale = (float)((int64_t)1 << fl);
}
else
{
outputScale = (float)1.0f / (float) ((int64_t)1 << -fl);
}
}
else if( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
outputScale = (float)1.0f / attr[1]->asymm.scale;
outputZP = (float)attr[1]->asymm.zero_point;
}
inputScale = attr[0]->scale;
inputTail = 0 - attr[0]->zero_point * inputScale;
outputScale = (float)1.0f / attr[1]->scale;
outputZP = (float)attr[1]->zero_point;
#define _PACK_SELECT_KEY( TYPE, IN_TYPE, OUT_TYPE ) \
(( TYPE << 24) | ( IN_TYPE << 16) | ( OUT_TYPE << 8))
@ -298,17 +271,23 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer)
switch( pack_key )
{
#if !(VX_ACTIVATION_SIN_COS_VX_SUPPORT_EXT)
case _PACK_SELECT_KEY( UNARY_SIN, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_COS, BF16, BF16 ):
#endif
#if !(VX_ACTIVATION_EXP_VX_SUPPORT_EXT)
case _PACK_SELECT_KEY( UNARY_EXP, BF16, BF16 ):
#endif
case _PACK_SELECT_KEY( UNARY_LOG, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_SELU, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_NEG, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_HSIGMOID, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_MISH, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_ROUND, BF16, BF16 ):
#if !(VX_ACTIVATION_GELU_VX_SUPPORT_EXT)
case _PACK_SELECT_KEY( UNARY_GELU, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_HGELU, BF16, BF16 ):
#endif
case _PACK_SELECT_KEY( UNARY_CELU, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_RCP, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_SIGN, BF16, BF16 ):
@ -317,6 +296,7 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer)
case _PACK_SELECT_KEY( UNARY_ATANH, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_ACOSH, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_INVERSE_SIGMOID, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_TAN, BF16, BF16 ):
{
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
@ -614,16 +594,22 @@ OnError:
} \
REGISTER_BACKEND_EVIS( KERNEL_NAME, _##KERNEL_NAME##_setup )
#if !(VX_ACTIVATION_SIN_COS_VX_SUPPORT_EXT)
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( sin, UNARY_SIN )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( cos, UNARY_COS )
#endif
#if !(VX_ACTIVATION_EXP_VX_SUPPORT_EXT)
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( exp, UNARY_EXP )
#endif
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( log, UNARY_LOG )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( neg, UNARY_NEG )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( hard_sigmoid, UNARY_HSIGMOID )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( mish, UNARY_MISH )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( round, UNARY_ROUND )
#if !(VX_ACTIVATION_GELU_VX_SUPPORT_EXT)
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( gelu, UNARY_GELU )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( hard_gelu, UNARY_HGELU )
#endif
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( selu, UNARY_SELU )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( celu, UNARY_CELU )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( rcp, UNARY_RCP )
@ -633,5 +619,6 @@ REGISTER_ELTWISE_UNARY_BACKEND_EVIS( atan, UNARY_ATAN )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( atanh, UNARY_ATANH )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( acosh, UNARY_ACOSH )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( inverse_sigmoid, UNARY_INVERSE_SIGMOID )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( tan, UNARY_TAN )
__END_DECLS
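Editor's note: the new UNARY_TAN entry reuses the generic elementwise-unary plumbing; mathematically the op is tan(x) applied to the dequantized value and requantized on output. A trivial reference of that element function, assuming the same scale/tail convention used by the initializer above (outputScale held as the reciprocal of the tensor scale):

    #include <math.h>
    #include <stdio.h>

    /* Illustrative element function for the TAN unary op:
     * dequantize, apply tan, requantize.                    */
    static float unary_tan_ref(float q_in, float input_scale, float input_tail,
                               float output_scale, float output_zp)
    {
        const float x = q_in * input_scale + input_tail;
        return tanf(x) * output_scale + output_zp;
    }

    int main(void)
    {
        /* Float path: scale 1, tail 0, no requantization. */
        printf("tan(0.5) ~= %.6f\n", unary_tan_ref(0.5f, 1.0f, 0.0f, 1.0f, 0.0f));
        return 0;
    }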

View File

@ -145,41 +145,10 @@ DEF_KERNEL_INITIALIZER(_erf_initializer)
out_shape = attr[1]->shape;
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[0]->dfp.fl;
if (fl > 0)
{
inputScale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
inputScale = (float)((int64_t)1 << -fl);
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
inputScale = attr[0]->asymm.scale;
inputTail = 0 - attr[0]->asymm.zero_point * inputScale;
}
if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[1]->dfp.fl;
if (fl > 0)
{
outputScale = (float)((int64_t)1 << fl);
}
else
{
outputScale = (float)1.0f / (float) ((int64_t)1 << -fl);
}
}
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
outputScale = (float)1.0f / attr[1]->asymm.scale;
outputZP = (float)attr[1]->asymm.zero_point;
}
inputScale = attr[0]->scale;
inputTail = 0 - (float)attr[0]->zero_point * inputScale;
outputScale = (float)1.0f / attr[1]->scale;
outputZP = (float)attr[1]->zero_point;
#define _PACK_SELECT_KEY( IN_TYPE, OUT_TYPE ) \
( ( IN_TYPE << 16) | ( OUT_TYPE << 8))

View File

@ -129,9 +129,6 @@ DEF_KERNEL_INITIALIZER(_floordiv_initializer)
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t *output_shape = NULL;
vsi_nn_kernel_dtype_e input0_dtype = F16;
int32_t input0_fl = 0;
int32_t input1_fl = 0;
int32_t output_fl = 0;
float inScale0 = 1.0f;
float inScale1 = 1.0f;
float outScale = 1.0f;
@ -169,59 +166,12 @@ DEF_KERNEL_INITIALIZER(_floordiv_initializer)
(output_shape->data[2] + gpu_param.global_scale[2] - 1)
/ gpu_param.global_scale[2] : 1;
if( input0_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
input0_fl = input0_attr->dfp.fl;
if (input0_fl > 0)
{
inScale0 = 1.0f / (float) ((int64_t)1 << input0_fl);
}
else
{
inScale0 = (float)((int64_t)1 << -input0_fl);
}
}
else if( input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
inScale0 = input0_attr->asymm.scale;
in0Tail = -inScale0 * ((float)input0_attr->asymm.zero_point);
}
if( input1_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
input1_fl = input1_attr->dfp.fl;
if (input1_fl > 0)
{
inScale1 = 1.0f / (float) ((int64_t)1 << input1_fl);
}
else
{
inScale1 = (float)((int64_t)1 << -input1_fl);
}
}
else if( input1_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
inScale1 = input1_attr->asymm.scale;
in1Tail = -inScale1 * ((float)input1_attr->asymm.zero_point);
}
if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
output_fl = output_attr->dfp.fl;
if (output_fl > 0)
{
outScale = (float) ((int64_t)1 << output_fl);
}
else
{
outScale = 1.0f / (float)((int64_t)1 << -output_fl);
}
}
else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
outScale = 1.0f / output_attr->asymm.scale;
outZp = (float)(output_attr->asymm.zero_point);
}
inScale0 = input0_attr->scale;
in0Tail = 0 - inScale0 * ((float)input0_attr->zero_point);
inScale1 = input1_attr->scale;
in1Tail = 0 - inScale1 * ((float)input1_attr->zero_point);
outScale = 1.0f / output_attr->scale;
outZp = (float)(output_attr->zero_point);
if (BF16 == input0_dtype)
{

View File

@ -22,7 +22,7 @@
*
*****************************************************************************/
#if !(VX_TENSOR_GATHER_API_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -202,6 +202,7 @@ static vx_param_description_t _gather_kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here

};
#define _GATHER_PARAM_NUM _cnt_of_array( _gather_kernel_param_def )
@ -285,6 +286,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
int32_t indices_num = 1;
uint32_t input_dims1 = 0;
int32_t batch = 1;
int32_t is_array = 0;
vx_uint32 i = 0;
vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL};
vsi_size_array_t * input1_shape = NULL;
@ -308,40 +310,13 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &block_num);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &is_array);
CHECK_STATUS_FAIL_GOTO(status, OnError );
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[0]->dfp.fl > 0)
{
src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
}
else
{
src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
src0Scale = attr[0]->asymm.scale;
src0ZP = attr[0]->asymm.zero_point;
}
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[2]->dfp.fl > 0)
{
dstScale = (float)((int64_t)1 << attr[2]->dfp.fl);
}
else
{
dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl));
}
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
dstScale = 1.0f / attr[2]->asymm.scale;
dstZP = attr[2]->asymm.zero_point;
}
src0Scale = attr[0]->scale;
src0ZP = attr[0]->zero_point;
dstScale = 1.0f / attr[2]->scale;
dstZP = attr[2]->zero_point;
input1_shape = attr[1]->shape;
input_dims1 = (uint32_t)input1_shape->size;
@ -358,8 +333,16 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
}
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
shaderParam.global_size[0] = gpu_align_p2((block_size + shaderParam.global_scale[0] - 1)
/ shaderParam.global_scale[0], 4);
if (is_array)
{
shaderParam.global_size[0] = (block_size + shaderParam.global_scale[0] - 1)
/ shaderParam.global_scale[0];
}
else
{
shaderParam.global_size[0] = gpu_align_p2((block_size + shaderParam.global_scale[0] - 1)
/ shaderParam.global_scale[0], 4);
}
shaderParam.global_size[1] = indices_num;
shaderParam.global_size[2] = block_num;
@ -508,39 +491,10 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer)
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &block_num);
CHECK_STATUS_FAIL_GOTO(status, OnError );
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[0]->dfp.fl > 0)
{
src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
}
else
{
src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
src0Scale = attr[0]->asymm.scale;
src0ZP = attr[0]->asymm.zero_point;
}
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[2]->dfp.fl > 0)
{
dstScale = (float)((int64_t)1 << attr[2]->dfp.fl);
}
else
{
dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl));
}
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
dstScale = 1.0f / attr[2]->asymm.scale;
dstZP = attr[2]->asymm.zero_point;
}
src0Scale = attr[0]->scale;
src0ZP = attr[0]->zero_point;
dstScale = 1.0f / attr[2]->scale;
dstZP = attr[2]->zero_point;
input1_shape = attr[1]->shape;
input_dims1 = (uint32_t)input1_shape->size;
@ -661,8 +615,11 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer)
{
status |= vsi_nn_kernel_gpu_add_param(node, "batch", &batch);
}
status |= vsi_nn_kernel_gpu_add_param(node, "width", &width);
status |= vsi_nn_kernel_gpu_add_param(node, "remainder", &remainder);
if (indices_num > GPU_TENSOR_MAX_WIDTH || block_num > GPU_TENSOR_MAX_WIDTH)
{
status |= vsi_nn_kernel_gpu_add_param(node, "width", &width);
status |= vsi_nn_kernel_gpu_add_param(node, "remainder", &remainder);
}
CHECK_STATUS_FAIL_GOTO(status, OnError );
OnError:
@ -841,6 +798,7 @@ static vsi_nn_kernel_node_t _setup
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_num );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_num );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &is_array );
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _GATHER_PARAM_NUM );
vsi_nn_kernel_scalar_release( &tmp_params[3] );
vsi_nn_kernel_scalar_release( &tmp_params[4] );
@ -859,3 +817,4 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_EVIS( gather, _setup )
#endif
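Editor's note: the new is_array scalar decides whether global_size[0] is padded to a multiple of 4. gpu_align_p2 rounds a launch count up to the next multiple of a power-of-two alignment; a hedged sketch of that rounding (the real helper lives in the GPU utility headers and may differ in name and signature):

    #include <assert.h>
    #include <stddef.h>

    /* Round n up to the next multiple of align, where align is a power of two. */
    static size_t align_p2(size_t n, size_t align)
    {
        return (n + align - 1) & ~(align - 1);
    }

    int main(void)
    {
        assert(align_p2(5, 4) == 8);   /* padded launch width         */
        assert(align_p2(8, 4) == 8);   /* already aligned stays as-is */
        /* With is_array set, the initializer above skips this padding and
         * launches exactly (block_size + scale - 1) / scale work items.  */
        return 0;
    }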

View File

@ -290,39 +290,10 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer)
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &block_size);
CHECK_STATUS_FAIL_GOTO(status, OnError );
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[0]->dfp.fl > 0)
{
src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
}
else
{
src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
src0Scale = attr[0]->asymm.scale;
src0ZP = attr[0]->asymm.zero_point;
}
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[2]->dfp.fl > 0)
{
dstScale = (float)((int64_t)1 << attr[2]->dfp.fl);
}
else
{
dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl));
}
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
dstScale = 1.0f / attr[2]->asymm.scale;
dstZP = attr[2]->asymm.zero_point;
}
src0Scale = attr[0]->scale;
src0ZP = attr[0]->zero_point;
dstScale = 1.0f / attr[2]->scale;
dstZP = attr[2]->zero_point;
indices_num = (int32_t)(attr[1]->shape->data[1]);
batch_num = (int32_t)(attr[1]->shape->size > 2 ? attr[1]->shape->data[2] : 1);

View File

@ -238,7 +238,7 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_initializer)
float tensorZP[4] = {0.0f, 0.0f, 0.0f, 0.0f};
uint32_t i = 0;
uint32_t pack_key = 0;
vsi_size_array_t * output_shape = NULL;
vsi_size_array_t * output_shape = NULL;
vsi_nn_kernel_tensor_attr_t * attr[4] = { NULL, NULL, NULL, NULL };
VSI_UNREFERENCED(param_size);
@ -254,12 +254,8 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_initializer)
for (i = 0; i < 4; i++)
{
if( attr[i]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[i]->quant == VSI_NN_KERNEL_QUANT_SYMM)
{
tensorZP[i] = (float)attr[i]->asymm.zero_point;
tensorScale[i] = attr[i]->asymm.scale;
}
tensorZP[i] = (float)attr[i]->zero_point;
tensorScale[i] = attr[i]->scale;
}
tensorZP[0] = tensorScale[0] * tensorZP[0];
@ -459,63 +455,31 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_cdnn_initializer)
output_shape = attr[3]->shape;
if( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM )
{
input_scale = attr[0]->asymm.scale;
input_tail = 0 - input_scale * (float)attr[0]->asymm.zero_point;
}
input_scale = attr[0]->scale;
input_tail = 0 - input_scale * (float)attr[0]->zero_point;
if( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM )
{
input_r_scale = attr[1]->asymm.scale;
input_r_tail = 0 - input_r_scale * (float)attr[1]->asymm.zero_point;
}
input_r_scale = attr[1]->scale;
input_r_tail = 0 - input_r_scale * (float)attr[1]->zero_point;
if( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM )
{
recur_r_scale = attr[2]->asymm.scale;
recur_r_tail = 0 - recur_r_scale * (float)attr[2]->asymm.zero_point;
}
recur_r_scale = attr[2]->scale;
recur_r_tail = 0 - recur_r_scale * (float)attr[2]->zero_point;
if( attr[3]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[3]->quant == VSI_NN_KERNEL_QUANT_SYMM )
{
output_scale = 1.0f / attr[3]->asymm.scale;
output_zp = (float)attr[3]->asymm.zero_point;
}
output_scale = 1.0f / attr[3]->scale;
output_zp = (float)attr[3]->zero_point;
if ( param_size == _GRUCELL_CDNN_SEP_ACTIVATION_PARAM_NUM )
{
if( attr[4]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[4]->quant == VSI_NN_KERNEL_QUANT_SYMM )
{
input_z_scale = attr[4]->asymm.scale;
input_z_tail = 0 - input_z_scale * (float)attr[4]->asymm.zero_point;
}
input_z_scale = attr[4]->scale;
input_z_tail = 0 - input_z_scale * (float)attr[4]->zero_point;
if( attr[5]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[5]->quant == VSI_NN_KERNEL_QUANT_SYMM )
{
recur_z_scale = attr[5]->asymm.scale;
recur_z_tail = 0 - recur_z_scale * (float)attr[5]->asymm.zero_point;
}
recur_z_scale = attr[5]->scale;
recur_z_tail = 0 - recur_z_scale * (float)attr[5]->zero_point;
if( attr[6]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[6]->quant == VSI_NN_KERNEL_QUANT_SYMM )
{
input_c_scale = attr[6]->asymm.scale;
input_c_tail = 0 - input_c_scale * (float)attr[6]->asymm.zero_point;
}
input_c_scale = attr[6]->scale;
input_c_tail = 0 - input_c_scale * (float)attr[6]->zero_point;
if( attr[5]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[5]->quant == VSI_NN_KERNEL_QUANT_SYMM )
{
recur_c_scale = attr[7]->asymm.scale;
recur_c_tail = 0 - recur_c_scale * (float)attr[7]->asymm.zero_point;
}
recur_c_scale = attr[7]->scale;
recur_c_tail = 0 - recur_c_scale * (float)attr[7]->zero_point;
}
if (layer_out == 1 || layer_out == 2)

View File

@ -119,6 +119,8 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_z_h_initializer)
float hstate_in_tail = 0;
float output_scale = 1.0f;
float output_zp = 0;
float output_scale1 = 1.0f;
float output_zp1 = 0;
uint32_t i = 0;
uint32_t pack_key = 0;
vsi_nn_kernel_tensor_attr_t* input_attr[GRUCELL_ACT_Z_H_IN_CNT] = {NULL};
@ -142,33 +144,14 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_z_h_initializer)
output_attr[1] = vsi_nn_kernel_tensor_attr_create( hstate_out );
CHECK_PTR_FAIL_GOTO( output_attr[1], "Create tensor attr buffer fail.", final );
if ( VSI_NN_KERNEL_QUANT_DFP == input_attr[0]->quant )
{
int8_t srcFixPointPos = (int8_t)input_attr[0]->dfp.fl;
if (srcFixPointPos >= 0)
hstate_in_scale *= 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos);
else if (srcFixPointPos < 0)
hstate_in_scale *= (vx_float32)((int64_t)1 << -srcFixPointPos);
}
else if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr[0]->quant )
{
hstate_in_scale = input_attr[0]->asymm.scale;
hstate_in_tail = -(float)input_attr[0]->asymm.zero_point * hstate_in_scale;
}
hstate_in_scale = input_attr[0]->scale;
hstate_in_tail = -(float)input_attr[0]->zero_point * hstate_in_scale;
if ( VSI_NN_KERNEL_QUANT_DFP == output_attr[0]->quant )
{
int8_t dstFixPointPos = (int8_t)output_attr[0]->dfp.fl;
if (dstFixPointPos >= 0)
output_scale *= (vx_float32)((int64_t)1 << dstFixPointPos);
else if (dstFixPointPos < 0)
output_scale *= 1.0f / (vx_float32) ((int64_t)1 << - dstFixPointPos);
}
else if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr[0]->quant )
{
output_scale = 1.0f / output_attr[0]->asymm.scale;
output_zp = (float)output_attr[0]->asymm.zero_point;
}
output_scale = 1.0f / output_attr[0]->scale;
output_zp = (float)output_attr[0]->zero_point;
output_scale1 = 1.0f / output_attr[1]->scale;
output_zp1 = (float)output_attr[1]->zero_point;
pack_key = _PACK_SELECT_KEY( input_attr[0]->dtype, input_attr[1]->dtype, output_attr[0]->dtype);
@ -290,6 +273,8 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_z_h_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "hstate_in_tail", &hstate_in_tail);
status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale);
status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "output_scale1", &output_scale1);
status |= vsi_nn_kernel_gpu_add_param(node, "output_zp1", &output_zp1);
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;

View File

@ -132,19 +132,8 @@ DEF_KERNEL_INITIALIZER(_grucell_h_times_activation_r_initializer)
output_attr[0] = vsi_nn_kernel_tensor_attr_create( output );
CHECK_PTR_FAIL_GOTO( output_attr[0], "Create tensor attr buffer fail.", final );
if ( VSI_NN_KERNEL_QUANT_DFP == input_attr[0]->quant )
{
int8_t srcFixPointPos = (int8_t)input_attr[0]->dfp.fl;
if (srcFixPointPos >= 0)
hstate_in_scale *= 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos);
else if (srcFixPointPos < 0)
hstate_in_scale *= (vx_float32)((int64_t)1 << -srcFixPointPos);
}
else if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr[0]->quant )
{
hstate_in_scale = input_attr[0]->asymm.scale;
hstate_in_tail = -(float)input_attr[0]->asymm.zero_point * hstate_in_scale;
}
hstate_in_scale = input_attr[0]->scale;
hstate_in_tail = 0 - (float)input_attr[0]->zero_point * hstate_in_scale;
pack_key = _PACK_SELECT_KEY( input_attr[0]->dtype, input_attr[1]->dtype, output_attr[0]->dtype);

View File

@ -47,6 +47,7 @@ typedef enum _grucell_nn_activation_type_e
SIGMOID = VSI_NN_ACT_SIGMOID,
HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID,
TANH = VSI_NN_ACT_TANH,
RELU = VSI_NN_ACT_RELU,
}grucell_nn_activation_type_e;
#define _GRUCELL_RESET_AFTER_ACTIVATION_KERNEL_SOURCE "grucell_reset_after_activation"
@ -80,6 +81,11 @@ static const _kernel_map_type _grucell_reset_after_activation_kernel_map[] =
PACK_KERNEL_MAP( I16, F16, I16, SIGMOID, SIGMOID ),
PACK_KERNEL_MAP( F16, F16, F16, SIGMOID, SIGMOID ),
PACK_KERNEL_MAP( BF16, BF16, BF16, SIGMOID, SIGMOID ),
PACK_KERNEL_MAP( U8, F16, U8, SIGMOID, RELU ),
PACK_KERNEL_MAP( I8, F16, I8, SIGMOID, RELU ),
PACK_KERNEL_MAP( I16, F16, I16, SIGMOID, RELU ),
PACK_KERNEL_MAP( F16, F16, F16, SIGMOID, RELU ),
PACK_KERNEL_MAP( BF16, BF16, BF16, SIGMOID, RELU ),
};
@ -148,33 +154,11 @@ DEF_KERNEL_INITIALIZER(_grucell_reset_after_activation_initializer)
output_attr[1] = vsi_nn_kernel_tensor_attr_create( hstate_out );
CHECK_PTR_FAIL_GOTO( output_attr[1], "Create tensor attr buffer fail.", final );
if ( VSI_NN_KERNEL_QUANT_DFP == input_attr[0]->quant )
{
int8_t srcFixPointPos = (int8_t)input_attr[0]->dfp.fl;
if (srcFixPointPos >= 0)
hstate_in_scale *= 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos);
else if (srcFixPointPos < 0)
hstate_in_scale *= (vx_float32)((int64_t)1 << -srcFixPointPos);
}
else if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr[0]->quant )
{
hstate_in_scale = input_attr[0]->asymm.scale;
hstate_in_tail = -(float)input_attr[0]->asymm.zero_point * hstate_in_scale;
}
hstate_in_scale = input_attr[0]->scale;
hstate_in_tail = -(float)input_attr[0]->zero_point * hstate_in_scale;
if ( VSI_NN_KERNEL_QUANT_DFP == output_attr[0]->quant )
{
int8_t srcFixPointPos = (int8_t)input_attr[0]->dfp.fl;
if (srcFixPointPos >= 0)
output_scale *= (vx_float32)((int64_t)1 << srcFixPointPos);
else if (srcFixPointPos < 0)
output_scale *= 1.0f / (vx_float32) ((int64_t)1 << -srcFixPointPos);
}
else if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr[0]->quant )
{
output_scale = 1.0f / output_attr[0]->asymm.scale;
output_zp = (float)output_attr[0]->asymm.zero_point;
}
output_scale = 1.0f / output_attr[0]->scale;
output_zp = (float)output_attr[0]->zero_point;
pack_key = _PACK_SELECT_KEY( input_attr[0]->dtype, input_attr[1]->dtype, output_attr[0]->dtype);

View File

@ -127,10 +127,8 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer)
vsi_size_array_t * output_shape = NULL;
vsi_nn_kernel_dtype_e input_dtype = F16;
vsi_nn_kernel_dtype_e output_dtype = F16;
int32_t input_fl = 0;
int32_t inputZP = 0;
float inputScale = 1.0f;
int32_t output_fl = 0;
int32_t outputZP = 0;
float outputScale = 1.0f;
float r_inputScale = 1.0f;
@ -153,41 +151,11 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer)
input_dtype = input_attr->dtype;
output_dtype = output_attr->dtype;
if ( VSI_NN_KERNEL_QUANT_DFP == input_attr->quant )
{
input_fl = input_attr->dfp.fl;
if (input_fl >= 0)
{
inputScale = 1.0f / (float) ((int64_t)1 << input_fl);
}
else
{
inputScale = (float) ((int64_t)1 << -input_fl);
}
}
else if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant )
{
inputZP = input_attr->asymm.zero_point;
inputScale = input_attr->asymm.scale;
}
inputZP = input_attr->zero_point;
inputScale = input_attr->scale;
if ( VSI_NN_KERNEL_QUANT_DFP == output_attr->quant )
{
output_fl = output_attr->dfp.fl;
if (output_fl >= 0)
{
outputScale = (float) ((int64_t)1 << output_fl);
}
else
{
outputScale = 1.0f / (float) ((int64_t)1 << -output_fl);
}
}
else if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant )
{
outputZP = output_attr->asymm.zero_point;
outputScale = 1.0f / output_attr->asymm.scale;
}
outputZP = output_attr->zero_point;
outputScale = 1.0f / output_attr->scale;
e2InScale = inputScale * inputScale;
r_inputScale = 1.0f / inputScale;

View File

@ -22,6 +22,7 @@
*
*****************************************************************************/
#if !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -42,7 +43,11 @@ __BEGIN_DECLS
#define SOURCE_AXIS0_1 "layer_normalization_1"
#define SOURCE_AXIS0_2 "layer_normalization_2"
#define SOURCE_AXIS0_3 "layer_normalization_3"
#define SOURCE_AXIS01 "layer_normalization_axis01"
#define SOURCE_AXIS01_SUM "layer_normalization_axis01_sum"
#define SOURCE_AXIS01_0 "layer_normalization_axis01_0"
#define SOURCE_AXIS01_1 "layer_normalization_axis01_1"
#define SOURCE_AXIS01_2 "layer_normalization_axis01_2"
#define SOURCE_AXIS01_3 "layer_normalization_axis01_3"
#define HASH_LAYERNORM_SH_KERNEL_NAME(SRC0_TYPE, SCALE_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("evis.layer_norm_axis0_"#SRC0_TYPE"_"#SCALE_TYPE"to"#DST_TYPE)
@ -88,15 +93,15 @@ __BEGIN_DECLS
#define HASH_LN_AXIS01_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("evis.layernorm_axis01_"#SRC0_TYPE"_"#SRC1_TYPE"to"#DST_TYPE)
#define LN_AXIS01_SUMS_KERNELS(IN0_TYPE, OUT_TYPE) \
#define LN_AXIS01_SUMS_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_LAYERNORM_KEY(IN0_TYPE, U4, OUT_TYPE, 0), \
HASH_LN_AXIS01_SUMS_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \
SOURCE_AXIS01 },
SOURCE },
#define LAYERNORM_AXIS01_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE) \
#define LAYERNORM_AXIS01_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \
{ HASH_LAYERNORM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \
HASH_LN_AXIS01_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \
SOURCE_AXIS01 },
SOURCE },
typedef struct
{
@ -159,32 +164,32 @@ static const _kernel_map_type _layernorm_kernel_map[] =
static const _kernel_map_type _layernorm_axis01_kernel_map[] =
{
// Register kernel here
LN_AXIS01_SUMS_KERNELS( I8, F32 )
LN_AXIS01_SUMS_KERNELS( U8, F32 )
LN_AXIS01_SUMS_KERNELS( F16, F32 )
LN_AXIS01_SUMS_KERNELS( I16, F32 )
LN_AXIS01_SUMS_KERNELS( I8, F32, SOURCE_AXIS01_SUM )
LN_AXIS01_SUMS_KERNELS( U8, F32, SOURCE_AXIS01_SUM )
LN_AXIS01_SUMS_KERNELS( F16, F32, SOURCE_AXIS01_SUM )
LN_AXIS01_SUMS_KERNELS( I16, F32, SOURCE_AXIS01_SUM )
LAYERNORM_AXIS01_KERNELS( U8, F16, U8 )
LAYERNORM_AXIS01_KERNELS( U8, F16, F16 )
LAYERNORM_AXIS01_KERNELS( I8, F16, I8 )
LAYERNORM_AXIS01_KERNELS( I8, F16, F16 )
LAYERNORM_AXIS01_KERNELS( F16, F16, F16 )
LAYERNORM_AXIS01_KERNELS( F16, F16, I16 )
LAYERNORM_AXIS01_KERNELS( F16, F16, I8 )
LAYERNORM_AXIS01_KERNELS( F16, F16, U8 )
LAYERNORM_AXIS01_KERNELS( I16, F16, I16 )
LAYERNORM_AXIS01_KERNELS( I16, F16, F16 )
LAYERNORM_AXIS01_KERNELS( U8, F16, U8, SOURCE_AXIS01_0 )
LAYERNORM_AXIS01_KERNELS( U8, F16, F16, SOURCE_AXIS01_0 )
LAYERNORM_AXIS01_KERNELS( I8, F16, I8, SOURCE_AXIS01_1 )
LAYERNORM_AXIS01_KERNELS( I8, F16, F16, SOURCE_AXIS01_1 )
LAYERNORM_AXIS01_KERNELS( F16, F16, F16, SOURCE_AXIS01_2 )
LAYERNORM_AXIS01_KERNELS( F16, F16, I16, SOURCE_AXIS01_2 )
LAYERNORM_AXIS01_KERNELS( F16, F16, I8, SOURCE_AXIS01_2 )
LAYERNORM_AXIS01_KERNELS( F16, F16, U8, SOURCE_AXIS01_2 )
LAYERNORM_AXIS01_KERNELS( I16, F16, I16, SOURCE_AXIS01_3 )
LAYERNORM_AXIS01_KERNELS( I16, F16, F16, SOURCE_AXIS01_3 )
LAYERNORM_AXIS01_KERNELS( U8, F32, U8 )
LAYERNORM_AXIS01_KERNELS( U8, F32, F16 )
LAYERNORM_AXIS01_KERNELS( I8, F32, I8 )
LAYERNORM_AXIS01_KERNELS( I8, F32, F16 )
LAYERNORM_AXIS01_KERNELS( F16, F32, F16 )
LAYERNORM_AXIS01_KERNELS( F16, F32, I16 )
LAYERNORM_AXIS01_KERNELS( F16, F32, I8 )
LAYERNORM_AXIS01_KERNELS( F16, F32, U8 )
LAYERNORM_AXIS01_KERNELS( I16, F32, I16 )
LAYERNORM_AXIS01_KERNELS( I16, F32, F16 )
LAYERNORM_AXIS01_KERNELS( U8, F32, U8, SOURCE_AXIS01_0 )
LAYERNORM_AXIS01_KERNELS( U8, F32, F16, SOURCE_AXIS01_0 )
LAYERNORM_AXIS01_KERNELS( I8, F32, I8, SOURCE_AXIS01_1 )
LAYERNORM_AXIS01_KERNELS( I8, F32, F16, SOURCE_AXIS01_1 )
LAYERNORM_AXIS01_KERNELS( F16, F32, F16, SOURCE_AXIS01_2 )
LAYERNORM_AXIS01_KERNELS( F16, F32, I16, SOURCE_AXIS01_2 )
LAYERNORM_AXIS01_KERNELS( F16, F32, I8, SOURCE_AXIS01_2 )
LAYERNORM_AXIS01_KERNELS( F16, F32, U8, SOURCE_AXIS01_2 )
LAYERNORM_AXIS01_KERNELS( I16, F32, I16, SOURCE_AXIS01_3 )
LAYERNORM_AXIS01_KERNELS( I16, F32, F16, SOURCE_AXIS01_3 )
};
@ -1165,3 +1170,4 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_EVIS( layer_norm, _setup )
#endif

View File

@ -22,6 +22,7 @@
*
*****************************************************************************/
#if !(VX_LOGSOFTMAX_VX_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -34,15 +35,21 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
__BEGIN_DECLS
#define HASH_LOG_SOFTMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \
((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d))
#define HASH_LOG_SOFTMAX_KERNEL_SOURCE_NAME(_suffix) \
"log_softmax_axis"#_suffix
#define HASH_LOG_SOFTMAX_EXCEED_KERNEL_SOURCE_NAME(_suffix) \
"log_softmax_exceed_axis"#_suffix
#define HASH_LOG_SOFTMAX_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, _suffix) \
{ HASH_LOG_SOFTMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \
CVIVANTE_NAMESPACE("evis.log_softmax_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE), \
@ -53,11 +60,18 @@ __BEGIN_DECLS
CVIVANTE_NAMESPACE("evis.log_softmax_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE"_2D"), \
HASH_LOG_SOFTMAX_KERNEL_SOURCE_NAME(_suffix) },
static const struct {
#define HASH_LOG_SOFTMAX_EXCEED_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, _suffix) \
{ HASH_LOG_SOFTMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \
CVIVANTE_NAMESPACE("evis.log_softmax_exceed_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE), \
HASH_LOG_SOFTMAX_EXCEED_KERNEL_SOURCE_NAME(_suffix) },
typedef struct {
uint32_t key;
char* function_name;
const char* source_name;
} _log_softmax_evis_kernel_map[] =
} _kernel_map_type;
static const _kernel_map_type _log_softmax_evis_kernel_map[] =
{
HASH_LOG_SOFTMAX_KERNELS(0, F16, F16, 0)
HASH_LOG_SOFTMAX_KERNELS(0, F16, I16, 0)
@ -126,6 +140,49 @@ static const struct {
};
static const _kernel_map_type _log_softmax_exceed_evis_kernel_map[] =
{
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, F16, F16, 0)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, F16, I16, 0)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, F16, U8, 0)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, F16, I8, 0)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, I16, I16, 0)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, I16, F16, 0)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, BF16, BF16, 0_BF16)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, BF16, F32, 0_BF16)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, BF16, F16, 0_BF16)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, U8, U8, 0)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, U8, F16, 0)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, I8, I8, 0)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, I8, F16, 0)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, F16, F16, 1)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, F16, I16, 1)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, F16, U8, 1)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, F16, I8, 1)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, I16, I16, 1)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, I16, F16, 1)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, BF16, BF16, 1_BF16)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, BF16, F32, 1_BF16)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, BF16, F16, 1_BF16)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, U8, U8, 1)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, U8, F16, 1)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, I8, I8, 1)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, I8, F16, 1)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, F16, F16, 2)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, F16, I16, 2)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, F16, U8, 2)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, F16, I8, 2)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, I16, I16, 2)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, I16, F16, 2)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, BF16, BF16, 2)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, U8, U8, 2)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, U8, F16, 2)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, I8, I8, 2)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, I8, F16, 2)
};
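For reference, the lookup key used by both kernel maps packs the softmax axis, the input/output dtypes and the 2D-image flag into one 32-bit value. A minimal standalone sketch of the same packing (hypothetical helper name, mirroring HASH_LOG_SOFTMAX_HASH_KEY above and assuming the dtype enum values fit in 8 bits):

#include <stdint.h>

/* Illustrative sketch only, not ovxlib API: same bit layout as
 * HASH_LOG_SOFTMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d). */
static uint32_t log_softmax_hash_key( uint32_t axis, uint32_t in_dtype,
                                      uint32_t out_dtype, uint32_t image_2d )
{
    return (axis << 20) | (in_dtype << 12) | (out_dtype << 4) | image_2d;
}

The _query_kernel helpers below compute the same key and linearly scan the corresponding map for a matching entry.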
static vx_param_description_t kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
@ -133,7 +190,9 @@ static vx_param_description_t kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _EVIS_PARAM_NUM _cnt_of_array(kernel_param_def)
#define _EVIS_PARAM_NUM _cnt_of_array(kernel_param_def)
#define SCALAR_INPUT_AXIS (2)
#define SCALAR_INPUT_BETA (3)
@ -157,7 +216,7 @@ DEF_KERNEL_INITIALIZER(_log_softmax_initializer)
float beta = 0;
float input_scale = 0;
float output_scale = 0;
int32_t outputZP = 0;
float outputZP = 0;
uint32_t inputWidth = 0;
uint32_t inputWidthRemain4 = 0;
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL, NULL };
@ -385,62 +444,25 @@ DEF_KERNEL_INITIALIZER(_log_softmax_initializer)
}
}
outputZP = (float)attr[1]->zero_point;
output_scale = 1.0f / (float)(attr[1]->scale);
if( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[1]->dfp.fl;
if (fl > 0)
{
output_scale = (float)((int64_t)1 << fl);
}
else
{
output_scale = (float)1.0f / (float) ((int64_t)1 << -fl);
}
status = vsi_nn_kernel_gpu_add_param( node,
"outputScale", &output_scale );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
float output_offset_asymmetric = 0;
outputZP = attr[1]->asymm.zero_point;
output_scale = 1.0f / (float)(attr[1]->asymm.scale);
output_offset_asymmetric = (float)outputZP;
status = vsi_nn_kernel_gpu_add_param( node,
"outputScale", &output_scale );
status |= vsi_nn_kernel_gpu_add_param( node,
"output_offset_asymmetric", &output_offset_asymmetric );
"output_offset_asymmetric", &outputZP );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
{
output_scale = 1;
outputZP = 0;
}
if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[0]->dfp.fl;
if (fl > 0)
{
input_scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
input_scale = (float)((int64_t)1 << -fl);
}
}
else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
input_scale = attr[0]->asymm.scale;
}
else
{
input_scale = 1.0f;
}
input_scale = attr[0]->scale;
scaleLogE = scaleLogE * input_scale;
beta = beta * input_scale;
@ -471,6 +493,296 @@ final:
return status;
} /* _log_softmax_initializer() */
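/* Illustrative derivation (editorial sketch, not part of the original change):
 * logE above is log2(e). For affine-quantized inputs the difference to the
 * per-axis maximum satisfies (x - x_max) = input_scale * (q - q_max), since the
 * zero point cancels, so the shader can presumably evaluate
 *     exp(beta * (x - x_max)) = 2^(logE * beta * input_scale * (q - q_max))
 *                             = 2^(scaleLogE * (q - q_max)),
 * which is why input_scale is folded into both scaleLogE and beta, while
 * rlogE = 1/log2(e) converts the accumulated log2-domain term back to a
 * natural logarithm for the final log-softmax value. */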
DEF_KERNEL_INITIALIZER(_log_softmax_exceed_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
int32_t axis = 0;
float beta = 0;
float input_scale = 0;
float output_scale = 0;
float outputZP = 0;
uint32_t inputWidth = 0;
uint32_t inputWidthRemain4 = 0;
int32_t width = 0;
int32_t height = 0;
int32_t depth = 0;
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL, NULL };
vsi_size_array_t * output_shape = NULL;
float logE = (float)(log10(exp(1.0f)) / log10(2.0f));
float rlogE = (float)(log10(2.0f) / log10(exp(1.0f)));
float scaleLogE = 0;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis);
CHECK_STATUS_FAIL_GOTO(status, final );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[3], &beta);
CHECK_STATUS_FAIL_GOTO(status, final );
scaleLogE = logE * beta;
output_shape = attr[1]->shape;
width = (int32_t)output_shape->data[0];
height = (int32_t)output_shape->data[1];
depth = output_shape->size > 2 ? (int32_t)output_shape->data[2] : 1;
gpu_param.dim = 2;
switch (axis)
{
case 0:
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_size[0] = 1;
gpu_param.global_size[1] = depth;
break;
case 1:
gpu_param.global_scale[0] = 8;
gpu_param.global_scale[1] = 1;
gpu_param.global_size[0] =
gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = 1;
break;
default:
break;
}
{
gpu_dp_inst_t uniGetSubData0to3_4x4 = {{
0x09090909, // TCfg
0x04040404, // ASelt
0x00010000, 0x00030002, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniGetSubData4to7_4x4 = {{
0x09090909, // TCfg
0x04040404, // ASelt
0x00050004, 0x00070006, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackMaxData_2x8 = {{
0x00000111, // TCfg
0x00000000, // ASelt
0x00050300, 0x00000000, // ABin
0x00000222, // BSelt
0x00000000, 0x00000000, // BBin
0x00004400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x05050404, 0x07070606, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractHalf4_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00020000, 0x00060004, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniGetSubLoData_4x4 = {{
0x09090909, // TCfg
0x04040404, // ASelt
0x00110000, 0x00330022, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniGetSubHiData_4x4 = {{
0x09090909, // TCfg
0x04040404, // ASelt
0x00550044, 0x00770066, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractHalf8_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
switch( axis )
{
case 0:
{
inputWidth = (uint32_t)(output_shape->data[axis] / 4 * 4);
inputWidthRemain4 = (uint32_t)(output_shape->data[axis] % 4);
status = vsi_nn_kernel_gpu_add_param( node,
"inputWidth", &inputWidth );
status |= vsi_nn_kernel_gpu_add_param( node,
"inputWidthRemain4", &inputWidthRemain4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniPackMaxData_2x8", &uniPackMaxData_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node, "axisSize", &width );
status |= vsi_nn_kernel_gpu_add_param( node, "height", &height);
if (attr[0]->dtype == BF16)
{
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtractHalf4_4x4", &uniExtractHalf4_4x4 );
}
else
{
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniGetSubData0to3_4x4", &uniGetSubData0to3_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniGetSubData4to7_4x4", &uniGetSubData4to7_4x4 );
}
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
case 1:
{
if (attr[0]->dtype == BF16)
{
status = vsi_nn_kernel_gpu_add_param( node,
"uniExtractHalf8_2x8", &uniExtractHalf8_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtractOddData_2x8", &uniExtractOddData_2x8 );
}
else
{
status = vsi_nn_kernel_gpu_add_param( node,
"uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniGetSubLoData_4x4", &uniGetSubLoData_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniGetSubHiData_4x4", &uniGetSubHiData_4x4 );
}
status |= vsi_nn_kernel_gpu_add_param( node, "axisSize", &height );
status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth);
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
default:
break;
}
}
outputZP = (float)attr[1]->zero_point;
output_scale = 1.0f / attr[1]->scale;
if (attr[0]->dtype != BF16)
{
status = vsi_nn_kernel_gpu_add_param( node,
"outputScale", &output_scale );
status |= vsi_nn_kernel_gpu_add_param( node,
"output_offset_asymmetric", &outputZP );
CHECK_STATUS_FAIL_GOTO(status, final );
}
input_scale = attr[0]->scale;
scaleLogE = scaleLogE * input_scale;
beta = beta * input_scale;
status |= vsi_nn_kernel_gpu_add_param( node,
"rlogE", &rlogE );
status |= vsi_nn_kernel_gpu_add_param( node,
"betaValue", &beta );
status |= vsi_nn_kernel_gpu_add_param( node,
"scaleLogE", &scaleLogE );
status |= vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, final );
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
}
return status;
}
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@ -513,7 +825,51 @@ static vsi_status _query_kernel
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
static vsi_status _query_kernel_exceed
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
int32_t axis,
vsi_nn_kernel_t* kernel
)
{
vsi_nn_kernel_dtype_e input_dtype;
vsi_nn_kernel_dtype_e output_dtype;
vsi_status status = VSI_FAILURE;
uint32_t key;
size_t i;
input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = HASH_LOG_SOFTMAX_HASH_KEY( axis, input_dtype, output_dtype, 0 );
for( i = 0; i < _cnt_of_array(_log_softmax_exceed_evis_kernel_map); i ++ )
{
if( _log_softmax_exceed_evis_kernel_map[i].key == key )
{
break;
}
}
if( i < _cnt_of_array(_log_softmax_exceed_evis_kernel_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _log_softmax_exceed_evis_kernel_map[i].function_name );
kernel->info.parameters = kernel_param_def;
kernel->info.numParams = _cnt_of_array( kernel_param_def );
kernel->info.initialize = _log_softmax_exceed_initializer;
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
_log_softmax_exceed_evis_kernel_map[i].source_name );
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
_log_softmax_exceed_evis_kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
}
static vsi_nn_kernel_node_t _setup_not_exceed
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
@ -528,7 +884,13 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_param_t node_params[_EVIS_PARAM_NUM] = {NULL};
vsi_bool image_2d = FALSE;
vsi_nn_kernel_node_t node = NULL;
vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } };
uint32_t rank_in = 0;
int32_t axis = 0;
int32_t new_axis = 0;
vsi_bool ret = vx_false_e;
uint32_t i = 0;
float beta = 1.0f;
VSI_UNREFERENCED(input_num);
@ -537,15 +899,31 @@ static vsi_nn_kernel_node_t _setup
axis = vsi_nn_kernel_param_get_int32(params, "axis");
beta = vsi_nn_kernel_param_get_float32(params, "beta");
if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size,
inputs[0]->attr.dim_num )
|| axis > 2)
ret = vsi_nn_kernel_optimize_softmax_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
shapes[0], &rank_in, &new_axis);
if (ret)
{
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shapes[0], rank_in );
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[0], rank_in );
}
else
{
return NULL;
}
image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1);
status = _query_kernel( inputs, outputs, axis, image_2d, kernel );
if( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[0]->attr.size,
reshape_tensors[0]->attr.dim_num )
|| new_axis > 2)
{
return NULL;
}
image_2d = (reshape_tensors[0]->attr.dim_num == 2 || reshape_tensors[0]->attr.size[2] == 1);
status = _query_kernel( inputs, outputs, new_axis, image_2d, kernel );
if( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
@ -553,9 +931,9 @@ static vsi_nn_kernel_node_t _setup
{
/* Pass parameters to node. */
vsi_nn_kernel_node_pack_io( node_params, _EVIS_PARAM_NUM,
inputs, 1, outputs, 1 );
reshape_tensors, 1, &reshape_tensors[1], 1 );
node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create(
graph, I32, &axis );
graph, I32, &new_axis );
node_params[SCALAR_INPUT_BETA] = vsi_nn_kernel_scalar_create(
graph, F32, &beta );
@ -565,10 +943,132 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_BETA] );
}
}
for (i = 0; i < 2; i++)
{
vsi_safe_release_tensor(reshape_tensors[i]);
}
return node;
} /* _setup() */
static vsi_nn_kernel_node_t _setup_exceed
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_EVIS_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } };
uint32_t rank_in = 0;
int32_t axis = 0;
int32_t new_axis = 0;
vsi_bool ret = vx_false_e;
uint32_t i = 0;
float beta = 1.0f;
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
axis = vsi_nn_kernel_param_get_int32(params, "axis");
beta = vsi_nn_kernel_param_get_float32(params, "beta");
ret = vsi_nn_kernel_optimize_softmax_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
shapes[0], &rank_in, &new_axis);
if (ret)
{
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shapes[0], rank_in );
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[0], rank_in );
}
else
{
return NULL;
}
if( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[0]->attr.size,
reshape_tensors[0]->attr.dim_num )
|| new_axis > 1)
{
return NULL;
}
status = _query_kernel_exceed(inputs, outputs, new_axis, kernel);
if( VSI_SUCCESS != status)
{
goto final;
}
node = vsi_nn_kernel_create_node( graph, kernel );
CHECK_PTR_FAIL_GOTO( node, "Create kernel fail.", final );
if (node)
{
vsi_nn_kernel_node_pack_io(node_params, _EVIS_PARAM_NUM,
reshape_tensors,
input_num,
&reshape_tensors[1],
output_num);
node_params[2] = vsi_nn_kernel_scalar_create(graph, I32, &new_axis );
node_params[3] = vsi_nn_kernel_scalar_create(graph, F32, &beta );
status = vsi_nn_kernel_node_pass_param(
node, node_params, _EVIS_PARAM_NUM);
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &node_params[2] );
vsi_nn_kernel_scalar_release( &node_params[3] );
}
final:
for (i = 0; i < 2; i++)
{
vsi_safe_release_tensor(reshape_tensors[i]);
}
return node;
}
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_nn_kernel_node_t node = NULL;
vsi_size_t *input_size = inputs[0]->attr.size;
int32_t axis = 0;
axis = vsi_nn_kernel_param_get_int32(params, "axis");
if (input_size[axis] >= GPU_TENSOR_MAX_WIDTH)
{
node = _setup_exceed(graph, inputs, input_num, outputs, output_num, params, kernel);
}
else
{
node = _setup_not_exceed(graph, inputs, input_num, outputs, output_num, params, kernel);
}
return node;
}
__END_DECLS
REGISTER_BACKEND_EVIS( log_softmax, _setup )
#endif

View File

@ -996,18 +996,14 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer)
float forget_bias = 0.0f;
float outputScale = 1.0f;
float outputZP = 0;
int32_t dstZP = 0;
float dstScale = 1.0f;
vsi_nn_kernel_dtype_e cellFormat = F16;
vsi_nn_kernel_dtype_e dstFormat = F16;
vsi_nn_kernel_quant_type_e dstQuantType = VSI_NN_KERNEL_QUANT_NONE;
int32_t dstFixPointPos = 0;
float logE = (vx_float32)(log10(exp(1.0f)) / log10(2.0f));
float logE = (float)(log10(exp(1.0f)) / log10(2.0f));
float twoLogE = 2 * logE;
uint32_t uint_min = 0xFBFFFFFF;
uint32_t uint_max = 0x7BFFFFFF;
float float_min = *(vx_float32 *)&uint_min;
float float_max = *(vx_float32 *)&uint_max;
float float_min = *(float *)&uint_min;
float float_max = *(float *)&uint_max;
float clip_Min_F[4] = {0};
float clip_Max_F[4] = {0};
uint32_t i = 0;
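The float_min/float_max bounds above are produced by reinterpreting fixed 32-bit patterns through a pointer cast. A minimal equivalent sketch (illustrative only, not part of this change) that performs the same reinterpretation via memcpy, which sidesteps strict-aliasing concerns in C:

#include <stdint.h>
#include <string.h>

/* Illustrative helper: bits_to_float(0x7BFFFFFF) and bits_to_float(0xFBFFFFFF)
 * reproduce the float_max / float_min clamp values used by this initializer. */
static float bits_to_float( uint32_t bits )
{
    float f;
    memcpy( &f, &bits, sizeof f );   /* reinterpret the raw 32-bit pattern */
    return f;
}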
@ -1063,22 +1059,11 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer)
status = vsi_nn_kernel_scalar_read_float32( (vsi_nn_kernel_scalar_t)param[param_size - 1], &forget_bias );
CHECK_STATUS_FAIL_GOTO(status, final );
cellFormat = attr[0]->dtype;
dstFormat = attr[1]->dtype;
cellFormat = attr[0]->dtype;
dstFormat = attr[1]->dtype;
dstQuantType = attr[1]->quant;
if ( VSI_NN_KERNEL_QUANT_DFP == dstQuantType )
{
dstFixPointPos = (int8_t)attr[1]->dfp.fl;
}
else if ( VSI_NN_KERNEL_QUANT_ASYMM == dstQuantType )
{
dstZP = attr[1]->asymm.zero_point;
dstScale = attr[1]->asymm.scale;
}
outputZP = (vx_float32)dstZP;
outputScale = 1.0f / attr[1]->scale;
outputZP = (float)attr[1]->zero_point;
gpu_param.global_scale[0] = 4;
gpu_param.global_scale[1] = 1;
@ -1182,20 +1167,6 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer)
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
if (dstQuantType == VSI_NN_KERNEL_QUANT_DFP)
{
if (dstFixPointPos >= 0)
outputScale *= (vx_float32)((int64_t)1 << dstFixPointPos);
else if (dstFixPointPos < 0)
outputScale *= 1.0f / (vx_float32) ((int64_t)1 << -dstFixPointPos);
outputZP = 0;
}
else if (dstQuantType == VSI_NN_KERNEL_QUANT_ASYMM)
{
outputScale = 1.0f / dstScale;
}
if ( cellFormat == F16 )
{
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractHalf4_4x4", &uniExtractHalf4_4x4);

View File

@ -288,67 +288,13 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer)
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &K);
CHECK_STATUS_FAIL_GOTO(status, OnError );
src0ZP = attr[0]->asymm.zero_point;
src0Scale = attr[0]->asymm.scale;
src1ZP = attr[1]->asymm.zero_point;
src1Scale = attr[1]->asymm.scale;
dstZP = (float)attr[2]->asymm.zero_point;
dstScale = attr[2]->asymm.scale;
src0ZP = attr[0]->zero_point;
src0Scale = attr[0]->scale;
src1ZP = attr[1]->zero_point;
src1Scale = attr[1]->scale;
dstZP = (float)attr[2]->zero_point;
dstScale = attr[2]->scale;
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[0]->dfp.fl > 0)
{
src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
}
else
{
src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
src0ZP = 0;
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE)
{
src0Scale = 1;
src0ZP = 0;
}
if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[1]->dfp.fl > 0)
{
src1Scale = (1.0f / ((float) ((int64_t)1 << attr[1]->dfp.fl)));
}
else
{
src1Scale = ((float) ((int64_t)1 << -attr[1]->dfp.fl));
}
src1ZP = 0;
}
else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE)
{
src1Scale = 1;
src1ZP = 0;
}
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[2]->dfp.fl > 0)
{
dstScale = (float)((int64_t)1 << attr[2]->dfp.fl);
}
else
{
dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl));
}
dstScale = 1.0f / dstScale;
dstZP = 0.0f;
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
dstScale = 1;
dstZP = 0.0f;
}
gpu_quantize_multiplier_16bit(src0Scale / 1.0f, &M0, &postShift0);
gpu_quantize_multiplier_16bit(src1Scale / 1.0f, &M1, &postShift1);
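gpu_quantize_multiplier_16bit turns each float rescale factor into an integer multiplier plus post-shift pair (M0/postShift0, M1/postShift1) for the shader. Its implementation is not part of this diff; a minimal sketch of the general multiplier/post-shift idea, with hypothetical names and no claim to match the ovxlib routine:

#include <math.h>
#include <stdint.h>

/* Sketch only: approximate scale ~= multiplier * 2^(-post_shift), with the
 * multiplier normalized into [2^14, 2^15), so a float rescale can be applied
 * as an integer multiply followed by a right shift. */
static void quantize_multiplier_16bit_sketch( float scale,
                                              uint16_t *multiplier,
                                              int32_t  *post_shift )
{
    int exponent = 0;
    float mantissa = frexpf( scale, &exponent );            /* scale = mantissa * 2^exponent */
    *multiplier = (uint16_t)lroundf( mantissa * 32768.0f ); /* mantissa * 2^15 */
    *post_shift = 15 - exponent;
}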
@ -1266,67 +1212,12 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_cross_initializer)
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &axis_size);
CHECK_STATUS_FAIL_GOTO(status, OnError );
src0ZP = attr[0]->asymm.zero_point;
src0Scale = attr[0]->asymm.scale;
src1ZP = attr[1]->asymm.zero_point;
src1Scale = attr[1]->asymm.scale;
dstZP = (float)attr[2]->asymm.zero_point;
dstScale = attr[2]->asymm.scale;
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[0]->dfp.fl > 0)
{
src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
}
else
{
src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
src0ZP = 0;
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE)
{
src0Scale = 1;
src0ZP = 0;
}
if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[1]->dfp.fl > 0)
{
src1Scale = (1.0f / ((float) ((int64_t)1 << attr[1]->dfp.fl)));
}
else
{
src1Scale = ((float) ((int64_t)1 << -attr[1]->dfp.fl));
}
src1ZP = 0;
}
else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE)
{
src1Scale = 1;
src1ZP = 0;
}
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[2]->dfp.fl > 0)
{
dstScale = (float)((int64_t)1 << attr[2]->dfp.fl);
}
else
{
dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl));
}
dstScale = 1.0f / dstScale;
dstZP = 0.0f;
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
dstScale = 1;
dstZP = 0.0f;
}
src0ZP = attr[0]->zero_point;
src0Scale = attr[0]->scale;
src1ZP = attr[1]->zero_point;
src1Scale = attr[1]->scale;
dstZP = (float)attr[2]->zero_point;
dstScale = attr[2]->scale;
mulKIn0In1Zp = (float)((int)(K + 3) / 4 * 4 * src1ZP * src0ZP);
inOutScale = src0Scale * src1Scale / dstScale;
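/* Illustrative derivation (editorial sketch, not part of the original change):
 * with affine quantization a = src0Scale * (qa - src0ZP) and
 * b = src1Scale * (qb - src1ZP), one output element is
 *     c = sum_k a * b
 *       = src0Scale * src1Scale * ( sum_k qa*qb - src0ZP * sum_k qb
 *                                   - src1ZP * sum_k qa + K * src0ZP * src1ZP ),
 * and requantizing as c / dstScale + dstZP yields the inOutScale factor above.
 * mulKIn0In1Zp uses (K + 3) / 4 * 4 instead of K, presumably because the
 * dot-product path accumulates the constant zero-point term over K rounded up
 * to a multiple of 4. */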

View File

@ -163,63 +163,12 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer)
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );
out_shape = attr[2]->shape;
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[0]->dfp.fl;
if (fl > 0)
{
input0_scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
input0_scale = (float)((int64_t)1 << -fl);
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM)
{
input0_zp = attr[0]->asymm.zero_point;
input0_scale = attr[0]->asymm.scale;
}
if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[1]->dfp.fl;
if (fl > 0)
{
input1_scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
input1_scale = (float)((int64_t)1 << -fl);
}
}
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM)
{
input1_zp = attr[1]->asymm.zero_point;
input1_scale = attr[1]->asymm.scale;
}
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[2]->dfp.fl;
if (fl > 0)
{
output_scale = (float) ((int64_t)1 << fl);
}
else
{
output_scale = 1.0f / (float)((int64_t)1 << -fl);
}
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM)
{
output_zp = attr[2]->asymm.zero_point;
output_scale = 1.0f / attr[2]->asymm.scale;
}
input0_zp = attr[0]->zero_point;
input0_scale = attr[0]->scale;
input1_zp = attr[1]->zero_point;
input1_scale = attr[1]->scale;
output_zp = attr[2]->zero_point;
output_scale = 1.0f / attr[2]->scale;
#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \
(IN0_TYPE | (IN1_TYPE << 8) | ( OUT_TYPE << 16))
@ -454,30 +403,52 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_tensor_t* tmp_inputs[2] = { NULL };
vsi_nn_type_e dtype1 = inputs[0]->attr.dtype.vx_type;
vsi_nn_type_e dtype2 = inputs[1]->attr.dtype.vx_type;
vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } };
vsi_size_t new_rank = 0;
vsi_bool ret = TRUE;
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
VSI_UNREFERENCED(params);
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
ret = vsi_nn_kernel_optimize_eltwise_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num,
inputs[1]->attr.size, inputs[1]->attr.dim_num,
outputs[0]->attr.size, outputs[0]->attr.dim_num,
shapes[0], shapes[1], shapes[2], &new_rank );
if (ret == FALSE)
{
return NULL;
goto final;
}
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shapes[0], new_rank );
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
inputs[1], shapes[1], new_rank );
reshape_tensors[2] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[2], new_rank );
if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[2]->attr.size,
reshape_tensors[2]->attr.dim_num ) )
{
goto final;
}
// Reorder tensor
if ( dtype1 != dtype2 && dtype1 == VSI_NN_TYPE_FLOAT16 )
{
int32_t order[2] = {1, 0};
vsi_nn_reorder_tensor( inputs, order, 2, tmp_inputs );
vsi_nn_reorder_tensor( reshape_tensors, order, 2, tmp_inputs );
}
else
{
memmove( tmp_inputs, inputs, sizeof(vsi_nn_tensor_t*) * 2 );
memmove( tmp_inputs, reshape_tensors, sizeof(vsi_nn_tensor_t*) * 2 );
}
image_2d = (outputs[0]->attr.dim_num == 2);
status = _query_kernel( tmp_inputs, outputs, image_2d, kernel );
image_2d = (reshape_tensors[2]->attr.dim_num == 2);
status = _query_kernel( tmp_inputs, &reshape_tensors[2], image_2d, kernel );
if ( VSI_SUCCESS == status )
{
node = vsi_nn_kernel_create_node( graph, kernel );
@ -485,10 +456,16 @@ static vsi_nn_kernel_node_t _setup
{
/* Pass parameters to node. */
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PARAM_NUM,
tmp_inputs, 2, outputs, 1 );
tmp_inputs, 2, &reshape_tensors[2], 1 );
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PARAM_NUM );
}
}
final:
vsi_safe_release_tensor(reshape_tensors[0]);
vsi_safe_release_tensor(reshape_tensors[1]);
vsi_safe_release_tensor(reshape_tensors[2]);
return node;
} /* _setup() */

View File

@ -163,63 +163,12 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer)
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );
out_shape = attr[2]->shape;
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[0]->dfp.fl;
if (fl > 0)
{
input0_scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
input0_scale = (float)((int64_t)1 << -fl);
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM)
{
input0_zp = attr[0]->asymm.zero_point;
input0_scale = attr[0]->asymm.scale;
}
if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[1]->dfp.fl;
if (fl > 0)
{
input1_scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
input1_scale = (float)((int64_t)1 << -fl);
}
}
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM)
{
input1_zp = attr[1]->asymm.zero_point;
input1_scale = attr[1]->asymm.scale;
}
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[2]->dfp.fl;
if (fl > 0)
{
output_scale = (float) ((int64_t)1 << fl);
}
else
{
output_scale = 1.0f / (float)((int64_t)1 << -fl);
}
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM)
{
output_zp = attr[2]->asymm.zero_point;
output_scale = 1.0f / attr[2]->asymm.scale;
}
input0_zp = attr[0]->zero_point;
input0_scale = attr[0]->scale;
input1_zp = attr[1]->zero_point;
input1_scale = attr[1]->scale;
output_zp = attr[2]->zero_point;
output_scale = 1.0f / attr[2]->scale;
#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \
(IN0_TYPE | (IN1_TYPE << 8) | ( OUT_TYPE << 16))
@ -454,30 +403,52 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_tensor_t* tmp_inputs[2] = { NULL };
vsi_nn_type_e dtype1 = inputs[0]->attr.dtype.vx_type;
vsi_nn_type_e dtype2 = inputs[1]->attr.dtype.vx_type;
vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } };
vsi_size_t new_rank = 0;
vsi_bool ret = TRUE;
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
VSI_UNREFERENCED(params);
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
ret = vsi_nn_kernel_optimize_eltwise_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num,
inputs[1]->attr.size, inputs[1]->attr.dim_num,
outputs[0]->attr.size, outputs[0]->attr.dim_num,
shapes[0], shapes[1], shapes[2], &new_rank );
if (ret == FALSE)
{
return NULL;
goto final;
}
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shapes[0], new_rank );
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
inputs[1], shapes[1], new_rank );
reshape_tensors[2] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[2], new_rank );
if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[2]->attr.size,
reshape_tensors[2]->attr.dim_num ) )
{
goto final;
}
// Reorder tensor
if ( dtype1 != dtype2 && dtype1 == VSI_NN_TYPE_FLOAT16 )
{
int32_t order[2] = {1, 0};
vsi_nn_reorder_tensor( inputs, order, 2, tmp_inputs );
vsi_nn_reorder_tensor( reshape_tensors, order, 2, tmp_inputs );
}
else
{
memmove( tmp_inputs, inputs, sizeof(vsi_nn_tensor_t*) * 2 );
memmove( tmp_inputs, reshape_tensors, sizeof(vsi_nn_tensor_t*) * 2 );
}
image_2d = (outputs[0]->attr.dim_num == 2);
status = _query_kernel( tmp_inputs, outputs, image_2d, kernel );
image_2d = (reshape_tensors[2]->attr.dim_num == 2);
status = _query_kernel( tmp_inputs, &reshape_tensors[2], image_2d, kernel );
if ( VSI_SUCCESS == status )
{
node = vsi_nn_kernel_create_node( graph, kernel );
@ -485,10 +456,16 @@ static vsi_nn_kernel_node_t _setup
{
/* Pass parameters to node. */
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PARAM_NUM,
tmp_inputs, 2, outputs, 1 );
tmp_inputs, 2, &reshape_tensors[2], 1 );
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PARAM_NUM );
}
}
final:
vsi_safe_release_tensor(reshape_tensors[0]);
vsi_safe_release_tensor(reshape_tensors[1]);
vsi_safe_release_tensor(reshape_tensors[2]);
return node;
} /* _setup() */

View File

@ -128,9 +128,6 @@ DEF_KERNEL_INITIALIZER(_mod_initializer)
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t *output_shape = NULL;
vsi_nn_kernel_dtype_e input0_dtype = F16;
int32_t input0_fl = 0;
int32_t input1_fl = 0;
int32_t output_fl = 0;
float inScale0 = 1.0f;
float inScale1 = 1.0f;
float outScale = 1.0f;
@ -168,59 +165,12 @@ DEF_KERNEL_INITIALIZER(_mod_initializer)
(output_shape->data[2] + gpu_param.global_scale[2] - 1)
/ gpu_param.global_scale[2] : 1;
if (input0_attr->quant == VSI_NN_KERNEL_QUANT_DFP)
{
input0_fl = input0_attr->dfp.fl;
if (input0_fl > 0)
{
inScale0 = 1.0f / (float) ((int64_t)1 << input0_fl);
}
else
{
inScale0 = (float)((int64_t)1 << -input0_fl);
}
}
else if (input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
inScale0 = input0_attr->asymm.scale;
in0Tail = -inScale0 * ((float)input0_attr->asymm.zero_point);
}
if (input1_attr->quant == VSI_NN_KERNEL_QUANT_DFP)
{
input1_fl = input1_attr->dfp.fl;
if (input1_fl > 0)
{
inScale1 = 1.0f / (float) ((int64_t)1 << input1_fl);
}
else
{
inScale1 = (float)((int64_t)1 << -input1_fl);
}
}
else if (input1_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
inScale1 = input1_attr->asymm.scale;
in1Tail = -inScale1 * ((float)input1_attr->asymm.zero_point);
}
if (output_attr->quant == VSI_NN_KERNEL_QUANT_DFP)
{
output_fl = output_attr->dfp.fl;
if (output_fl > 0)
{
outScale = (float) ((int64_t)1 << output_fl);
}
else
{
outScale = 1.0f / (float)((int64_t)1 << -output_fl);
}
}
else if (output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
outScale = 1.0f / output_attr->asymm.scale;
outZp = (float)(output_attr->asymm.zero_point);
}
inScale0 = input0_attr->scale;
in0Tail = 0 - inScale0 * ((float)input0_attr->zero_point);
inScale1 = input1_attr->scale;
in1Tail = 0 - inScale1 * ((float)input1_attr->zero_point);
outScale = 1.0f / output_attr->scale;
outZp = (float)(output_attr->zero_point);
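/* Illustrative summary (grounded in the assignments above): these terms apply
 * the usual affine mapping,
 *     x_real = scale * (q - zero_point) = scale * q + tail,  tail = -zero_point * scale,
 * on each input, and q_out = y_real * outScale + outZp on the output, with
 * outScale = 1 / output_attr->scale. */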
if (BF16 == input0_dtype)
{

View File

@ -239,76 +239,12 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
input_shape = attr[0]->shape;
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
input_zp = attr[0]->asymm.zero_point;
scaleIn = attr[0]->asymm.scale;
}
else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[0]->dfp.fl > 0)
{
scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
}
else
{
scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
input_zp = 0;
}
else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE)
{
input_zp = 0;
scaleIn = 1;
}
if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
output_ZP0 = (float)attr[1]->asymm.zero_point;
outputScale0 = 1.0f / attr[1]->asymm.scale;
}
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[1]->dfp.fl > 0)
{
outputScale0 = (float)((int64_t)1 << attr[1]->dfp.fl);
}
else
{
outputScale0 = (1.0f / (float)((int64_t)1 << -attr[1]->dfp.fl));
}
output_ZP0 = 0.0f;
}
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
outputScale0 = 1.0f;
output_ZP0 = 0.0f;
}
if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
output_ZP1 = (float)attr[2]->asymm.zero_point;
outputScale1 = 1.0f / attr[2]->asymm.scale;
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[2]->dfp.fl > 0)
{
outputScale1 = (float)((int64_t)1 << attr[2]->dfp.fl);
}
else
{
outputScale1 = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl));
}
output_ZP1 = 0.0f;
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
outputScale1 = 1.0f;
output_ZP1 = 0.0f;
}
input_zp = attr[0]->zero_point;
scaleIn = attr[0]->scale;
output_ZP0 = (float)attr[1]->zero_point;
outputScale0 = 1.0f / attr[1]->scale;
output_ZP1 = (float)attr[2]->zero_point;
outputScale1 = 1.0f / attr[2]->scale;
output_ZP[0] = output_ZP0;
output_ZP[1] = output_ZP1;

View File

@ -160,16 +160,13 @@ DEF_KERNEL_INITIALIZER(_one_hot_initializer)
in_shape = attr[0]->shape;
depth = (int32_t)(attr[1]->shape->data[1]);
input_dtype = attr[0]->dtype;
input_zp = attr[0]->zero_point;
scaleIn = attr[0]->scale;
if (VSI_NN_KERNEL_QUANT_DFP == attr[0]->quant)
{
srcFixPointPos = attr[0]->dfp.fl;
}
else if (VSI_NN_KERNEL_QUANT_ASYMM == attr[0]->quant)
{
input_zp = attr[0]->asymm.zero_point;
scaleIn = attr[0]->asymm.scale;
}
if (suffix_size == 1)
{

View File

@ -155,41 +155,19 @@ DEF_KERNEL_INITIALIZER(_poolwithargmax_initializer)
input_shape = input_attr->shape;
src_dtype = input_attr->dtype;
dst_dtype = output_attr->dtype;
inputScale = input_attr->scale;
input_ZP = input_attr->zero_point;
outputScale = output_attr->scale;
output_ZP = output_attr->zero_point;
if( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
input_fl = input_attr->dfp.fl;
if (input_fl > 0)
{
inputScale = 1.0f / (float) ((int64_t)1 << input_fl);
}
else
{
inputScale = (float)((int64_t)1 << -input_fl);
}
}
else if( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
inputScale = input_attr->asymm.scale;
input_ZP = input_attr->asymm.zero_point;
}
if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
output_fl = output_attr->dfp.fl;
if (output_fl > 0)
{
outputScale = 1.0f / (float) ((int64_t)1 << output_fl);
}
else
{
outputScale = (float)((int64_t)1 << -output_fl);
}
}
else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
outputScale = output_attr->asymm.scale;
output_ZP = output_attr->asymm.zero_point;
}
if ( ( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )

View File

@ -22,6 +22,7 @@
*
*****************************************************************************/
#if !(VX_TENSOR_POW_API_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -158,64 +159,13 @@ DEF_KERNEL_INITIALIZER(_pow_initializer)
attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError );
out_shape = attr[2]->shape;
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[0]->dfp.fl;
if (fl > 0)
{
input0_scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
input0_scale = (float)((int64_t)1 << -fl);
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM )
{
input0_scale = attr[0]->asymm.scale;
input0_tail = 0 - (float)attr[0]->asymm.zero_point * input0_scale;
}
if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[1]->dfp.fl;
if (fl > 0)
{
input1_scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
input1_scale = (float)((int64_t)1 << -fl);
}
}
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM)
{
input1_scale = attr[1]->asymm.scale;
input1_tail = 0 - (float)attr[1]->asymm.zero_point * input1_scale;
}
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[2]->dfp.fl;
if (fl > 0)
{
output_scale = (float) ((int64_t)1 << fl);
}
else
{
output_scale = 1.0f / (float)((int64_t)1 << -fl);
}
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM )
{
output_zp = (float)attr[2]->asymm.zero_point;
output_scale = 1.0f / attr[2]->asymm.scale;
}
out_shape = attr[2]->shape;
input0_scale = attr[0]->scale;
input0_tail = 0 - (float)attr[0]->zero_point * input0_scale;
input1_scale = attr[1]->scale;
input1_tail = 0 - (float)attr[1]->zero_point * input1_scale;
output_zp = (float)attr[2]->zero_point;
output_scale = 1.0f / attr[2]->scale;
#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \
(IN0_TYPE | (IN1_TYPE << 8) | ( OUT_TYPE << 16))
@ -454,3 +404,4 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_EVIS( pow, _setup )
#endif

View File

@ -140,28 +140,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer)
}
enable_copy = (int32_t)(xRatio == (1 << 15) && yRatio == (1 << 15));
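/* Presumably xRatio/yRatio are Q15 fixed-point resize ratios, so (1 << 15)
 * corresponds to a 1:1 mapping and enable_copy selects the copy kernel
 * instead of the scaling kernel. */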
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[0]->dfp.fl > 0)
{
outputScale = (float)((int64_t)1 << attr[0]->dfp.fl);
}
else
{
outputScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl));
}
dstZP = 0;
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
outputScale = 1.0f / attr[0]->asymm.scale;
dstZP = attr[0]->asymm.zero_point;
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
outputScale = 1;
dstZP = 0;
}
outputScale = 1.0f / attr[0]->scale;
dstZP = attr[0]->zero_point;
shaderParam.global_scale[0] = 4;
shaderParam.global_scale[1] = 1;

View File

@ -133,28 +133,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_gray_copy_initializer)
width = (uint32_t)(out_shape->data[0]);
height = (uint32_t)(out_shape->data[1]);
if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[0]->dfp.fl > 0)
{
outputScale = (float)((int64_t)1 << attr[0]->dfp.fl);
}
else
{
outputScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl));
}
dstZP = 0.0f;
}
else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
outputScale = 1.0f / attr[0]->asymm.scale;
dstZP = (float)attr[0]->asymm.zero_point;
}
else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
outputScale = 1;
dstZP = 0.0f;
}
outputScale = 1.0f / attr[0]->scale;
dstZP = (float)attr[0]->zero_point;
shaderParam.global_scale[0] = 16;
shaderParam.global_scale[1] = 1;
@ -232,33 +212,11 @@ DEF_KERNEL_INITIALIZER(_pre_process_gray_initializer)
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
out_shape = attr[0]->shape;
dstZP = (float)attr[0]->asymm.zero_point;
outputScale = attr[0]->asymm.scale;
dstZP = (float)attr[0]->zero_point;
outputScale = 1.0f / attr[0]->scale;
width = (uint32_t)(out_shape->data[0]);
height = (uint32_t)(out_shape->data[1]);
if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[0]->dfp.fl > 0)
{
outputScale = (float)((int64_t)1 << attr[0]->dfp.fl);
}
else
{
outputScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl));
}
dstZP = 0.0f;
}
else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
outputScale = 1.0f/outputScale;
}
else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
outputScale = 1;
dstZP = 0.0f;
}
shaderParam.global_scale[0] = 4;
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
@ -499,8 +457,8 @@ OnError:
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;

View File

@ -0,0 +1,884 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
__BEGIN_DECLS
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOF16 \
CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toF16")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOI16 \
CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toI16")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOU8 \
CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toU8")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOI8 \
CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toI8")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_COPY_U8TOU8 \
CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_copy_U8toU8")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_COPY_U8TOI8 \
CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_copy_U8toI8")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_COPY_U8TOI16 \
CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_copy_U8toI16")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_COPY_U8TOF16 \
CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_copy_U8toF16")
// greater than a quarter
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOU8_GQ \
CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toU8_gq")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOI8_GQ \
CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toU8_gq")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOF16_GQ \
CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toF16_gq")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOI16_GQ \
CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toI16_gq")
#define KERNEL_SOURCE_1 "pre_process_nv12_rggb_copy",
#define KERNEL_SOURCE_2 "pre_process_nv12_rggb_scale",
typedef enum
{
COPY = 0,
SCALE,
TRANS
} vsi_nn_kernel_convert_type_e;
#define HASH_PRE_PROCESS_NV12_RGGB_KEY(_input0_type, _output_type, _convert_type, _greater_quarter) \
((_input0_type << 24) | (_output_type << 16) | (_convert_type << 8) | (_greater_quarter))
#define TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, SOURCE) \
{ HASH_PRE_PROCESS_NV12_RGGB_KEY(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, 0), \
VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_##CONVERT_TYPE##_##IN0_TYPE##TO##OUT_TYPE, \
SOURCE },
#define TENSOR_PRE_PROCESS_NV12_RGGB_GQ_KERNELS(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, SOURCE) \
{ HASH_PRE_PROCESS_NV12_RGGB_KEY(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, 1), \
VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_##CONVERT_TYPE##_##IN0_TYPE##TO##OUT_TYPE##_GQ, \
SOURCE },
static const struct {
uint32_t key;
char* function_name;
const char* source_name;
} pre_process_nv12_rggb_map[] =
{
TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, U8, COPY, KERNEL_SOURCE_1)
TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, I8, COPY, KERNEL_SOURCE_1)
TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, I16, COPY, KERNEL_SOURCE_1)
TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, F16, COPY, KERNEL_SOURCE_1)
TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_2)
TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_2)
TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_2)
TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_2)
TENSOR_PRE_PROCESS_NV12_RGGB_GQ_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_2)
TENSOR_PRE_PROCESS_NV12_RGGB_GQ_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_2)
TENSOR_PRE_PROCESS_NV12_RGGB_GQ_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_2)
TENSOR_PRE_PROCESS_NV12_RGGB_GQ_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_2)
};
static vx_param_description_t vxPreProcessNv12_RGGBKernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _EVIS_PRE_PROCESS_NV12_RGGB_PARAM_NUM _cnt_of_array(vxPreProcessNv12_RGGBKernel_param_def)
static vsi_bool _check_nv12_type_from_env()
{
vsi_bool ret = FALSE;
char* env_s = vsi_nn_getenv("VSI_NN_ENABLE_OCV_NV12");
if (env_s)
{
ret = TRUE;
}
return ret;
}
DEF_KERNEL_INITIALIZER(_pre_process_nv12_rggb_copy_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t shaderParam = {
3, // workdim
{0, 0, 0}, // globalWorkOffset: controls the start location to be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels are processed by a single thread
{0, 0, 0}, // localWorkSize: local work-group size in threads
{0, 0, 0}}; // globalWorkSize: image size in threads
float output_zp = 0;
float output_scale = 1;
int32_t reorder = 0;
int32_t order1 = 3;
uint32_t width = 0;
uint32_t height = 0;
int32_t nv_type = 0;
float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f;
float b_scale = 0.0f, g_scale = 0.0f, r_scale = 0.0f;
float outputScaleVar_b = 0.0f, outputScaleVar_g = 0.0f, outputScaleVar_r = 0.0f;
float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f;
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_size_array_t * out_shape = NULL;
vsi_bool ocv_nv12 = _check_nv12_type_from_env();
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[7], &rMean);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &gMean);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &bMean);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[10], &r_scale);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &reorder);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &nv_type);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[14], &g_scale);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[15], &b_scale);
CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[0]->shape;
output_scale = 1.0f / attr[0]->scale;
output_zp = (float)attr[0]->zero_point;
width = (uint32_t)(out_shape->data[0]);
height = (uint32_t)(out_shape->data[1]);
if (reorder != 0)
{
reorder = 3;
order1 = 0;
}
if (nv_type == VSI_NN_YUV_TYPE_NV21_BGGR)
{
int32_t tmporder = reorder;
reorder = order1;
order1 = tmporder;
}
outputScaleVar_b = output_scale * b_scale;
outputScaleVar_g = output_scale * g_scale;
outputScaleVar_r = output_scale * r_scale;
bMeanScaleVarZp = output_zp - bMean * outputScaleVar_b;
gMeanScaleVarZp = output_zp - gMean * outputScaleVar_g;
rMeanScaleVarZp = output_zp - rMean * outputScaleVar_r;
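    /* Illustrative derivation (editorial note, not part of the original change):
     * per channel c the folded terms above give
     *     out = outputScaleVar_c * value + cMeanScaleVarZp
     *         = output_scale * c_scale * (value - cMean) + output_zp,
     * i.e. mean subtraction, the per-channel scale and the output quantization
     * are fused into a single multiply-add in the shader. */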
shaderParam.global_scale[0] = 4;
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1)
/ shaderParam.global_scale[0], 4);
shaderParam.global_size[1] = gpu_align_p2((height + shaderParam.global_scale[1] - 1)
/ shaderParam.global_scale[1], 2);
shaderParam.global_size[2] = 1;
status = vsi_nn_kernel_gpu_config( node, &shaderParam );
CHECK_STATUS_FAIL_GOTO(status, OnError);
{
gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertNV12toB_4x4 = {{
0x05050505, // TCfg
0x04040404, // ASelt
0x00210000, 0x00630042, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x3f1d3c00, 0x00000000, 0x3f1d3c00, 0x00000000,
0x3f1d3c00, 0x00000000, 0x3f1d3c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertNV12toG_4x4 = {{
0x29292929, // TCfg
0x14141414, // ASelt
0x03210100, 0x07630542, // ABin
0x2a2a2a2a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc,
0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertNV12toR_4x4 = {{
0x05050505, // TCfg
0x04040404, // ASelt
0x00310010, 0x00730052, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000,
0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractYtoShortSub16_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractUVtoCharSub128_2x8 = {{
0x99999999, // TCfg
0x44444444, // ASelt
0x01000100, 0x03020302, // ABin
0xaaaaaaaa, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00010001, 0x00010001, 0x00010001, 0x00010001,
0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertUchartoFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000,
0x00030002, // ABin
0x02020202, // BSelt
0x00000000,
0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
if (ocv_nv12)
{
uniConvertNV12toB_4x4.data[2] = 0x00010000;
uniConvertNV12toB_4x4.data[3] = 0x00230022;
uniConvertNV12toB_4x4.data[8] = 0x40093ca7;
uniConvertNV12toB_4x4.data[10] = 0x40093ca7;
uniConvertNV12toB_4x4.data[12] = 0x40093ca7;
uniConvertNV12toB_4x4.data[14] = 0x40093ca7;
uniConvertNV12toG_4x4.data[2] = 0x01010100;
uniConvertNV12toG_4x4.data[3] = 0x03230322;
uniConvertNV12toG_4x4.data[8] = 0x36413ca7;
uniConvertNV12toG_4x4.data[9] = 0x00003a81;
uniConvertNV12toG_4x4.data[10] = 0x36413ca7;
uniConvertNV12toG_4x4.data[11] = 0x00003a81;
uniConvertNV12toG_4x4.data[12] = 0x36413ca7;
uniConvertNV12toG_4x4.data[13] = 0x00003a81;
uniConvertNV12toG_4x4.data[14] = 0x36413ca7;
uniConvertNV12toG_4x4.data[15] = 0x00003a81;
uniConvertNV12toR_4x4.data[2] = 0x00110010;
uniConvertNV12toR_4x4.data[3] = 0x00330032;
uniConvertNV12toR_4x4.data[8] = 0x3e623ca7;
uniConvertNV12toR_4x4.data[10] = 0x3e623ca7;
uniConvertNV12toR_4x4.data[12] = 0x3e623ca7;
uniConvertNV12toR_4x4.data[14] = 0x3e623ca7;
uniExtractUVtoCharSub128_2x8.data[2] = 0x03020100;
uniExtractUVtoCharSub128_2x8.data[3] = 0x07060504;
uniExtractYtoShortSub16_2x8.data[0] = 0x99999999;
uniExtractYtoShortSub16_2x8.data[1] = 0x44444444;
uniExtractYtoShortSub16_2x8.data[4] = 0xaaaaaaaa;
uniExtractYtoShortSub16_2x8.data[8] = 0x00010001;
uniExtractYtoShortSub16_2x8.data[9] = 0x00010001;
uniExtractYtoShortSub16_2x8.data[10] = 0x00010001;
uniExtractYtoShortSub16_2x8.data[11] = 0x00010001;
uniExtractYtoShortSub16_2x8.data[12] = 0x00010001;
uniExtractYtoShortSub16_2x8.data[13] = 0x00010001;
uniExtractYtoShortSub16_2x8.data[14] = 0x00010001;
uniExtractYtoShortSub16_2x8.data[15] = 0x00010001;
}
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toB_4x4", &uniConvertNV12toB_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toG_4x4", &uniConvertNV12toG_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toR_4x4", &uniConvertNV12toR_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder);
status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractUVtoCharSub128_2x8", &uniExtractUVtoCharSub128_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractYtoShortSub16_2x8", &uniExtractYtoShortSub16_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_b", &outputScaleVar_b);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_g", &outputScaleVar_g);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_r", &outputScaleVar_r);
status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUchartoFp32_4x4", &uniConvertUchartoFp32_4x4);
CHECK_STATUS_FAIL_GOTO(status, OnError);
switch( attr[0]->dtype )
{
case U8:
case I8:
case I16:
{
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertInt32toUint8_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case F16:
{
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertHalftoFp16_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
default:
break;
}
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
OnError:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
return status;
} /* _pre_process_nv12_rggb_copy_initializer() */
DEF_KERNEL_INITIALIZER(_pre_process_nv12_rggb_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t shaderParam = {
3, // workdim
{0, 0, 0}, // globalWorkOffset: controls the start location to be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels are processed by a single thread
{0, 0, 0}, // localWorkSize: local work-group size in threads
{0, 0, 0}}; // globalWorkSize: total image size in threads
float output_zp = 0;
float output_scale = 1;
int32_t reorder = 0;
int32_t order1 = 3;
uint32_t width = 0;
uint32_t height = 0;
uint32_t roi_width = 0;
uint32_t roi_height = 0;
uint32_t xrIntFloat_16 = 0;
uint32_t yrIntFloat_16 = 0;
int32_t xRatio = 0;
int32_t yRatio = 0;
int32_t nv_type = 0;
float bMean = 0.0f, gMean = 0.0f, rMean = 0.0f;
float b_scale = 0.0f, g_scale = 0.0f, r_scale = 0.0f;
float outputScaleVar_b = 0.0f, outputScaleVar_g = 0.0f, outputScaleVar_r = 0.0f;
float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f;
float resize = 0.0f;
vsi_bool ocv_nv12 = _check_nv12_type_from_env();
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
vsi_size_array_t * out_shape = NULL;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &xRatio);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &yRatio);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[7], &rMean);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &gMean);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &bMean);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[10], &r_scale);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &reorder);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &nv_type);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[14], &g_scale);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[15], &b_scale);
CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[1]->shape;
output_scale = 1.0f / attr[1]->scale;
output_zp = (float)attr[1]->zero_point;
width = (uint32_t)(out_shape->data[0]);
height = (uint32_t)(out_shape->data[1]);
if (reorder != 0)
{
reorder = 3;
order1 = 0;
}
if (nv_type == VSI_NN_YUV_TYPE_NV21_BGGR)
{
int32_t tmporder = reorder;
reorder = order1;
order1 = tmporder;
}
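/* xRatio / yRatio are Q15 fixed-point ratios of ROI extent to output extent;
   shifting right by 15 recovers the ROI size, "resize" is the output-to-ROI
   width ratio, and xrIntFloat_16 / yrIntFloat_16 are Q16.16 per-pixel source
   steps consumed by the shader. */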
roi_width = (xRatio * width) >> 15;
roi_height = (yRatio * height) >> 15;
resize = (float)width / roi_width;
xrIntFloat_16 = (uint32_t)((roi_width << 16) / width + 1);
yrIntFloat_16 = (uint32_t)((roi_height << 16) / height + 1);
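/* Fold per-channel normalization (mean / scale) and output quantization into
   a single affine transform: q = pixel * outputScaleVar_c + cMeanScaleVarZp
   for each channel c. */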
outputScaleVar_b = output_scale * b_scale;
outputScaleVar_g = output_scale * g_scale;
outputScaleVar_r = output_scale * r_scale;
bMeanScaleVarZp = output_zp - bMean * outputScaleVar_b;
gMeanScaleVarZp = output_zp - gMean * outputScaleVar_g;
rMeanScaleVarZp = output_zp - rMean * outputScaleVar_r;
shaderParam.global_scale[0] = 4;
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1)
/ shaderParam.global_scale[0], 4);
shaderParam.global_size[1] = gpu_align_p2((height + shaderParam.global_scale[1] - 1)
/ shaderParam.global_scale[1], 2);
shaderParam.global_size[2] = 1;
status = vsi_nn_kernel_gpu_config( node, &shaderParam );
CHECK_STATUS_FAIL_GOTO(status, OnError);
{
gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertNV12toB_4x4 = {{
0x05050505, // TCfg
0x04040404, // ASelt
0x00210000, 0x00630042, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x3f1d3c00, 0x00000000, 0x3f1d3c00, 0x00000000,
0x3f1d3c00, 0x00000000, 0x3f1d3c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertNV12toG_4x4 = {{
0x29292929, // TCfg
0x14141414, // ASelt
0x03210100, 0x07630542, // ABin
0x2a2a2a2a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc,
0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertNV12toR_4x4 = {{
0x05050505, // TCfg
0x04040404, // ASelt
0x00310010, 0x00730052, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000,
0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertYtoShortSub16_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertUVtoCharSub128_2x8 = {{
0x99999999, // TCfg
0x44444444, // ASelt
0x03020100, 0x07060504, // ABin
0xaaaaaaaa, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00010001, 0x00010001, 0x00010001, 0x00010001,
0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
}, GPU_DP_TYPE_16 };
//trans
gpu_dp_inst_t uniCalculateYShift_2x8 = {{
0x00009999, // TCfg
0x00000000, // ASelt
0x06040200, 0x00000000, // ABin
0x00005555, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniCalculateUVShift_2x8 = {{
0x51515151, // TCfg
0x40404040, // ASelt
0x02020000, 0x06060404, // ABin
0x91919191, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00010000, 0x00000000, 0x00010000,
0x00000000, 0x00010000, 0x00000000, 0x00010000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertUchartoFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000,
0x00030002, // ABin
0x02020202, // BSelt
0x00000000,
0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
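/* Same OpenCV-compatible coefficient patch as in the copy initializer above. */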
if (ocv_nv12)
{
uniConvertNV12toB_4x4.data[2] = 0x00010000;
uniConvertNV12toB_4x4.data[3] = 0x00230022;
uniConvertNV12toB_4x4.data[8] = 0x40093ca7;
uniConvertNV12toB_4x4.data[10] = 0x40093ca7;
uniConvertNV12toB_4x4.data[12] = 0x40093ca7;
uniConvertNV12toB_4x4.data[14] = 0x40093ca7;
uniConvertNV12toG_4x4.data[2] = 0x01010100;
uniConvertNV12toG_4x4.data[3] = 0x03230322;
uniConvertNV12toG_4x4.data[8] = 0x36413ca7;
uniConvertNV12toG_4x4.data[9] = 0x00003a81;
uniConvertNV12toG_4x4.data[10] = 0x36413ca7;
uniConvertNV12toG_4x4.data[11] = 0x00003a81;
uniConvertNV12toG_4x4.data[12] = 0x36413ca7;
uniConvertNV12toG_4x4.data[13] = 0x00003a81;
uniConvertNV12toG_4x4.data[14] = 0x36413ca7;
uniConvertNV12toG_4x4.data[15] = 0x00003a81;
uniConvertNV12toR_4x4.data[2] = 0x00110010;
uniConvertNV12toR_4x4.data[3] = 0x00330032;
uniConvertNV12toR_4x4.data[8] = 0x3e623ca7;
uniConvertNV12toR_4x4.data[10] = 0x3e623ca7;
uniConvertNV12toR_4x4.data[12] = 0x3e623ca7;
uniConvertNV12toR_4x4.data[14] = 0x3e623ca7;
uniConvertYtoShortSub16_2x8.data[0] = 0x99999999;
uniConvertYtoShortSub16_2x8.data[1] = 0x44444444;
uniConvertYtoShortSub16_2x8.data[4] = 0xaaaaaaaa;
uniConvertYtoShortSub16_2x8.data[8] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[9] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[10] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[11] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[12] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[13] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[14] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[15] = 0x00010001;
}
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toB_4x4", &uniConvertNV12toB_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toG_4x4", &uniConvertNV12toG_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toR_4x4", &uniConvertNV12toR_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUVtoCharSub128_2x8", &uniConvertUVtoCharSub128_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertYtoShortSub16_2x8", &uniConvertYtoShortSub16_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "xrIntFloat_16", &xrIntFloat_16);
status |= vsi_nn_kernel_gpu_add_param(node, "yrIntFloat_16", &yrIntFloat_16);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_b", &outputScaleVar_b);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_g", &outputScaleVar_g);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_r", &outputScaleVar_r);
status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp);
if (resize >= 0.25)
{
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateYShift_2x8", &uniCalculateYShift_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateUVShift_2x8", &uniCalculateUVShift_2x8);
}
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUchartoFp32_4x4", &uniConvertUchartoFp32_4x4);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder);
status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1);
CHECK_STATUS_FAIL_GOTO(status, OnError );
switch( attr[1]->dtype )
{
case U8:
case I8:
case I16:
{
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertInt32toUint8_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case F16:
{
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertHalftoFp16_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
default:
break;
}
}
OnError:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;
} /* _pre_process_nv12_rggb_initializer() */
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel,
const vsi_nn_kernel_param_t * params,
int32_t scale_x
)
{
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8;
vsi_nn_kernel_convert_type_e convert_type = SCALE;
vsi_status status = VSI_FAILURE;
uint32_t key = 0;
size_t i = 0;
vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" );
vsi_size_t dstWidth = outputs[0]->attr.size[0];
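/* scaleVal is the output-to-ROI width ratio derived from the Q15 scale_x,
   mirroring the "resize" value computed in the initializer; it decides
   whether the shift-optimized kernel variant can be selected. */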
float scaleVal = (float)dstWidth / ((scale_x * dstWidth) >> 15);
uint32_t optFlg = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (enable_copy)
{
convert_type = COPY;
}
else
{
convert_type = SCALE;
}
if (scaleVal >= 0.25 && convert_type == SCALE)
{
optFlg = 1;
}
key = HASH_PRE_PROCESS_NV12_RGGB_KEY( input0_dtype, output_dtype, convert_type, optFlg );
for ( i = 0; i < _cnt_of_array(pre_process_nv12_rggb_map); i ++ )
{
if ( pre_process_nv12_rggb_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(pre_process_nv12_rggb_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_nv12_rggb_map[i].function_name );
kernel->info.parameters = vxPreProcessNv12_RGGBKernel_param_def;
kernel->info.numParams = _cnt_of_array( vxPreProcessNv12_RGGBKernel_param_def );
if (convert_type == COPY)
{
kernel->info.initialize = _pre_process_nv12_rggb_copy_initializer;
}
else
{
kernel->info.initialize = _pre_process_nv12_rggb_initializer;
}
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
pre_process_nv12_rggb_map[i].source_name );
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
pre_process_nv12_rggb_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_NV12_RGGB_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
int32_t trans = 0;
int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" );
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( inputs, outputs, kernel, params, scale_x );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
uint32_t index = 3;
int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" );
int32_t left = vsi_nn_kernel_param_get_int32( params, "left" );
int32_t top = vsi_nn_kernel_param_get_int32( params, "top" );
float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" );
float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" );
float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" );
float r_scale = vsi_nn_kernel_param_get_float32( params, "r_scale" );
float g_scale = vsi_nn_kernel_param_get_float32( params, "g_scale" );
float b_scale = vsi_nn_kernel_param_get_float32( params, "b_scale" );
int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" );
int32_t nv_type = vsi_nn_kernel_param_get_int32( params, "nv_type" );
/* Pass parameters to node. */
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_NV12_RGGB_PARAM_NUM,
inputs, 2, outputs, 1 );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_scale );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &nv_type );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_scale );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_scale );
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PRE_PROCESS_NV12_RGGB_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &tmp_params[3] );
vsi_nn_kernel_scalar_release( &tmp_params[4] );
vsi_nn_kernel_scalar_release( &tmp_params[5] );
vsi_nn_kernel_scalar_release( &tmp_params[6] );
vsi_nn_kernel_scalar_release( &tmp_params[7] );
vsi_nn_kernel_scalar_release( &tmp_params[8] );
vsi_nn_kernel_scalar_release( &tmp_params[9] );
vsi_nn_kernel_scalar_release( &tmp_params[10] );
vsi_nn_kernel_scalar_release( &tmp_params[11] );
vsi_nn_kernel_scalar_release( &tmp_params[12] );
vsi_nn_kernel_scalar_release( &tmp_params[13] );
vsi_nn_kernel_scalar_release( &tmp_params[14] );
vsi_nn_kernel_scalar_release( &tmp_params[15] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( pre_process_nv12_rggb, _setup )

View File

@ -403,23 +403,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_copy_initializer)
out_shape = attr[0]->shape;
width = (uint32_t)(out_shape->data[0]);
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if ( attr[0]->dfp.fl > 0 )
{
output_scale *= (float)((int64_t)1 << attr[0]->dfp.fl);
}
else
{
output_scale *= (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl));
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
output_zp = (float)attr[0]->asymm.zero_point;
output_scale /= attr[0]->asymm.scale;
}
output_zp = (float)attr[0]->zero_point;
output_scale = 1.0f / attr[0]->scale;
shaderParam.global_scale[0] = 16;
shaderParam.global_scale[1] = 1;
@ -620,8 +605,8 @@ OnError:
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;

View File

@ -463,22 +463,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_copy_initializer)
width = (uint32_t)(out_shape->data[0] / 3);
height = (uint32_t)(out_shape->data[1]);
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if ( attr[0]->dfp.fl > 0 )
{
output_scale *= (float)((int64_t)1 << attr[0]->dfp.fl);
}
else
{
output_scale *= (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl));
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
output_zp = (float)attr[0]->asymm.zero_point;
output_scale /= attr[0]->asymm.scale;
}
output_zp = (float)attr[0]->zero_point;
output_scale = 1.0f / attr[0]->scale;
if (attr[0]->dtype == F16 || attr[0]->dtype == I16)
{
@ -787,8 +773,8 @@ OnError:
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;

View File

@ -179,28 +179,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
}
enable_copy = (int32_t)(xRatio == (1 << 15) && yRatio == (1 << 15));
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[0]->dfp.fl > 0)
{
outputScale = (float)((int64_t)1 << attr[0]->dfp.fl);
}
else
{
outputScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl));
}
outputZP = 0;
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
outputScale = 1.0f / attr[0]->asymm.scale;
outputZP = (float)attr[0]->asymm.zero_point;
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
outputScale = 1;
outputZP = 0;
}
outputScale = 1.0f / attr[0]->scale;
outputZP = (float)attr[0]->zero_point;
#define _PACK_SELECT_KEY( COPY_FLAG, REVERSE_FLAG, TRANS_FLAG) \
(COPY_FLAG | (REVERSE_FLAG << 24) | (TRANS_FLAG << 16) )

View File

@ -143,23 +143,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer)
order1 = 0;
}
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
dstScale = 1.0f / attr[0]->asymm.scale;
dstZP = attr[0]->asymm.zero_point;
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[0]->dfp.fl > 0)
{
dstScale = (float)((int64_t)1 << attr[0]->dfp.fl);
}
else
{
dstScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl));
}
dstZP = 0;
}
dstScale = 1.0f / attr[0]->scale;
dstZP = attr[0]->zero_point;
shaderParam.global_scale[0] = 16;
shaderParam.global_scale[1] = 1;
@ -501,8 +486,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[0]->shape;
dstZP = attr[0]->asymm.zero_point;
dstScale = attr[0]->asymm.scale;
dstZP = attr[0]->zero_point;
dstScale = 1.0f / attr[0]->scale;
width = (uint32_t)(out_shape->data[0]);
height = (uint32_t)(out_shape->data[1]);
@ -512,28 +497,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer)
order1 = 0;
}
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[0]->dfp.fl > 0)
{
dstScale = (vx_float32)((int64_t)1 << attr[0]->dfp.fl);
}
else
{
dstScale = (1.0f / (vx_float32)((int64_t)1 << -attr[0]->dfp.fl));
}
dstZP = 0;
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
dstScale = 1.0f / dstScale;
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
dstScale = 1;
dstZP = 0;
}
shaderParam.global_scale[0] = 4;
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;

View File

@ -164,46 +164,24 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );
out_shape = attr[2]->shape;
out_shape = attr[2]->shape;
inputZP0 = attr[0]->zero_point;
input_scale0 = attr[0]->scale;
inputZP1 = attr[1]->zero_point;
input_scale1 = attr[1]->scale;
outputZP = (float)attr[2]->zero_point;
input_scale0 = input_scale0 / attr[2]->scale;
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
in0_fl = (int8_t)attr[0]->dfp.fl;
if (in0_fl >= 0)
{
input_scale0 = 1.0f / (vx_float32) ((int64_t)1 << in0_fl);
}
else if (in0_fl < 0)
{
input_scale0 = (vx_float32) ((int64_t)1 << -in0_fl);
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
inputZP0 = attr[0]->asymm.zero_point;
input_scale0 = attr[0]->asymm.scale;
}
if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
inputZP1 = attr[1]->asymm.zero_point;
input_scale1 = attr[1]->asymm.scale;
}
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
out_fl = (int8_t)attr[2]->dfp.fl;
}
if (out_fl >= 0)
input_scale0 *= (vx_float32)((int64_t)1 << out_fl);
else if (out_fl < 0)
input_scale0 *= 1.0f / (vx_float32) ((int64_t)1 << -out_fl);
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
out_fl = 1;
outputZP = (float)attr[2]->asymm.zero_point;
input_scale0 = input_scale0 / attr[2]->asymm.scale;
}
shift0 = in0_fl - out_fl;
is_2d_img = (out_shape->size < 3) || (out_shape->data[2] == 1);

View File

@ -152,7 +152,6 @@ DEF_KERNEL_INITIALIZER(_reducemax_internal_initializer)
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t * input_shape = NULL;
vsi_size_array_t * output_shape = NULL;
int32_t input_fl = 0, output_fl = 0;
int32_t axisSize = 0;
float inputScale = 1.0f;
float input_offset_asymmetric = 0.0f;
@ -257,68 +256,19 @@ DEF_KERNEL_INITIALIZER(_reducemax_internal_initializer)
}
}
if( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
input_fl = input_attr->dfp.fl;
if (input_fl > 0)
{
inputScale = 1.0f / (float) ((int64_t)1 << input_fl);
}
else
{
inputScale = (float)((int64_t)1 << -input_fl);
}
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
inputScale = input_attr->asymm.scale;
input_offset_asymmetric = (float)(input_attr->asymm.zero_point);
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
{
inputScale = 1.0f;
input_offset_asymmetric = 0;
inputScale = input_attr->scale;
input_offset_asymmetric = (float)(input_attr->zero_point);
outputScale = 1.0f / output_attr->scale;
output_offset_asymmetric = (float)(output_attr->zero_point);
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
output_fl = output_attr->dfp.fl;
if (output_fl > 0)
{
outputScale = (float) ((int64_t)1 << output_fl);
}
else
{
outputScale = 1.0f / (float)((int64_t)1 << -output_fl);
}
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
outputScale = 1.0f / output_attr->asymm.scale;
output_offset_asymmetric = (float)(output_attr->asymm.zero_point);
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
{
outputScale = 1.0f;
output_offset_asymmetric = 0;
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
status = vsi_nn_kernel_gpu_add_param( node, "axisSize", &axisSize );
CHECK_STATUS_FAIL_GOTO(status, final );
status = vsi_nn_kernel_gpu_config( node, &gpu_param );

View File

@ -154,7 +154,6 @@ DEF_KERNEL_INITIALIZER(_reducemin_internal_initializer)
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t * input_shape = NULL;
vsi_size_array_t * output_shape = NULL;
int32_t input_fl = 0, output_fl = 0;
int32_t axisSize = 0;
float inputScale = 1.0f;
float input_offset_asymmetric = 0.0f;
@ -259,68 +258,18 @@ DEF_KERNEL_INITIALIZER(_reducemin_internal_initializer)
}
}
if( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
input_fl = input_attr->dfp.fl;
if (input_fl > 0)
{
inputScale = 1.0f / (float) ((int64_t)1 << input_fl);
}
else
{
inputScale = (float)((int64_t)1 << -input_fl);
}
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
inputScale = input_attr->asymm.scale;
input_offset_asymmetric = (float)(input_attr->asymm.zero_point);
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
{
inputScale = 1.0f;
input_offset_asymmetric = 0;
inputScale = input_attr->scale;
input_offset_asymmetric = (float)(input_attr->zero_point);
outputScale = 1.0f / output_attr->scale;
output_offset_asymmetric = (float)(output_attr->zero_point);
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
output_fl = output_attr->dfp.fl;
if (output_fl > 0)
{
outputScale = (float) ((int64_t)1 << output_fl);
}
else
{
outputScale = 1.0f / (float)((int64_t)1 << -output_fl);
}
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
outputScale = 1.0f / output_attr->asymm.scale;
output_offset_asymmetric = (float)(output_attr->asymm.zero_point);
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
{
outputScale = 1.0f;
output_offset_asymmetric = 0;
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
status = vsi_nn_kernel_gpu_add_param( node, "axisSize", &axisSize );
CHECK_STATUS_FAIL_GOTO(status, final );

View File

@ -160,7 +160,6 @@ DEF_KERNEL_INITIALIZER(_reduceprod_internal_initializer)
vsi_size_array_t * output_shape = NULL;
vsi_nn_kernel_dtype_e src_dtype = F16;
vsi_nn_kernel_dtype_e dst_dtype = F16;
int32_t input_fl = 0, output_fl = 0;
int32_t axisSize = 0;
float inputScale = 1.0f;
float input_offset_asymmetric = 0.0f;
@ -348,68 +347,17 @@ DEF_KERNEL_INITIALIZER(_reduceprod_internal_initializer)
CHECK_STATUS_FAIL_GOTO(status, final );
}
if( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
input_fl = input_attr->dfp.fl;
if (input_fl > 0)
{
inputScale = 1.0f / (float) ((int64_t)1 << input_fl);
}
else
{
inputScale = (float)((int64_t)1 << -input_fl);
}
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
inputScale = input_attr->asymm.scale;
input_offset_asymmetric = (float)(input_attr->asymm.zero_point);
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
{
inputScale = 1.0f;
input_offset_asymmetric = 0;
inputScale = input_attr->scale;
input_offset_asymmetric = (float)(input_attr->zero_point);
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
output_fl = output_attr->dfp.fl;
if (output_fl > 0)
{
outputScale = (float) ((int64_t)1 << output_fl);
}
else
{
outputScale = 1.0f / (float)((int64_t)1 << -output_fl);
}
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
outputScale = 1.0f / output_attr->asymm.scale;
output_offset_asymmetric = (float)(output_attr->asymm.zero_point);
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
{
outputScale = 1.0f;
output_offset_asymmetric = 0;
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
outputScale = 1.0f / output_attr->scale;
output_offset_asymmetric = (float)(output_attr->zero_point);
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
status = vsi_nn_kernel_gpu_config( node, &gpu_param );

View File

@ -138,8 +138,6 @@ DEF_KERNEL_INITIALIZER(_relu_keras_initializer)
float inputTail = 0.0f;
float output_ZP = 0;
float input_ZP = 0;
int32_t srcFixPointPos = 0;
int32_t dstFixPointPos = 0;
VSI_UNREFERENCED(param_size);
@ -154,25 +152,10 @@ DEF_KERNEL_INITIALIZER(_relu_keras_initializer)
output_dtype = output_attr->dtype;
offset = alpha * threshold;
if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
srcFixPointPos = input_attr->dfp.fl;
}
else if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant)
{
input_ZP = (float)(input_attr->asymm.zero_point);
scaleIn = input_attr->asymm.scale;
}
if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant)
{
dstFixPointPos = output_attr->dfp.fl;
}
else if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant)
{
output_ZP = (float)(output_attr->asymm.zero_point);
scaleOut = 1.0f / output_attr->asymm.scale;
}
input_ZP = (float)(input_attr->zero_point);
scaleIn = input_attr->scale;
output_ZP = (float)(output_attr->zero_point);
scaleOut = 1.0f / output_attr->scale;
gpu_param.global_scale[0] = 8;
gpu_param.global_scale[1] = 1;
@ -195,11 +178,6 @@ DEF_KERNEL_INITIALIZER(_relu_keras_initializer)
}
else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
if (srcFixPointPos >=0 )
scaleIn = 1.0f / (float) ((int64_t)1 << srcFixPointPos);
else
scaleIn = (float) ((int64_t)1 << -srcFixPointPos);
status = vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn);
CHECK_STATUS_FAIL_GOTO(status, final );
}
@ -212,11 +190,6 @@ DEF_KERNEL_INITIALIZER(_relu_keras_initializer)
}
else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant)
{
if (dstFixPointPos >=0 )
scaleOut = (float) ((int64_t)1 << dstFixPointPos);
else
scaleOut = 1.0f / (float) ((int64_t)1 << -dstFixPointPos);
status = vsi_nn_kernel_gpu_add_param(node, "output_scale", &scaleOut);
CHECK_STATUS_FAIL_GOTO(status, final );
}

View File

@ -197,8 +197,6 @@ DEF_KERNEL_INITIALIZER(_resize_1d_bilinear_initializer)
int32_t half_pixel_centers = 0;
uint32_t depth = 0;
int32_t srcFixPointPos = 0;
int32_t dstFixPointPos = 0;
float input_scale = 1.0;
int32_t inputZP = 0;
float output_scale = 1.0;
@ -259,53 +257,10 @@ DEF_KERNEL_INITIALIZER(_resize_1d_bilinear_initializer)
half_pixel_value = 0.0f;
}
if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant )
{
input_scale = input_attr->asymm.scale;
inputZP = input_attr->asymm.zero_point;
}
else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
srcFixPointPos = input_attr->dfp.fl;
if (srcFixPointPos >= 0)
{
input_scale = 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos);
}
else if (srcFixPointPos < 0)
{
input_scale = (vx_float32)((int64_t)1 << -srcFixPointPos);
}
inputZP = 0;
}
else
{
input_scale = 1.0f;
inputZP = 0;
}
if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant )
{
output_scale = output_attr->asymm.scale;
outputZP = output_attr->asymm.zero_point;
}
else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant)
{
dstFixPointPos = output_attr->dfp.fl;
if (dstFixPointPos >= 0)
{
output_scale = (vx_float32) ((int64_t)1 << dstFixPointPos);
}
else if (dstFixPointPos < 0)
{
output_scale = 1.0f / (vx_float32) ((int64_t)1 << -dstFixPointPos);
}
outputZP = 0;
}
else
{
output_scale = 1.0;
outputZP = 0;
}
input_scale = input_attr->scale;
inputZP = input_attr->zero_point;
output_scale = output_attr->scale;
outputZP = output_attr->zero_point;
if (is_run_nx_kernel)
{
@ -473,7 +428,7 @@ DEF_KERNEL_INITIALIZER(_resize_1d_bilinear_initializer)
}
else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
float dfpScale = input_scale * output_scale;
float dfpScale = input_scale / output_scale;
gpu_dp_inst_t uniConvertDFP2FP32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt

View File

@ -198,52 +198,19 @@ DEF_KERNEL_INITIALIZER(_resize_1d_nearest_initializer)
half_pixel_value = 0.0f;
}
if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant )
{
input_scale = input_attr->asymm.scale;
inputZP = input_attr->asymm.zero_point;
}
else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
input_scale = input_attr->scale;
inputZP = input_attr->zero_point;
output_scale = 1.0f / output_attr->scale;
outputZP = output_attr->zero_point;
if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
srcFixPointPos = input_attr->dfp.fl;
if (srcFixPointPos >= 0)
{
input_scale = 1.0f / (float) ((int64_t)1 << srcFixPointPos);
}
else if (srcFixPointPos < 0)
{
input_scale = (float)((int64_t)1 << -srcFixPointPos);
}
inputZP = 0;
}
else
{
input_scale = 1.0f;
inputZP = 0;
}
if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant )
{
output_scale = 1.0f / output_attr->asymm.scale;
outputZP = output_attr->asymm.zero_point;
}
else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant)
if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant)
{
dstFixPointPos = output_attr->dfp.fl;
if (dstFixPointPos >= 0)
{
output_scale = (float) ((int64_t)1 << dstFixPointPos);
}
else if (dstFixPointPos < 0)
{
output_scale = 1.0f / (float) ((int64_t)1 << -dstFixPointPos);
}
outputZP = 0;
}
else
{
output_scale = 1.0;
outputZP = 0;
}
if (F16 == input_dtype && F16 == output_dtype)

View File

@ -122,12 +122,16 @@ static const _kernel_map_type _resize_bilinear_kernel_map[] =
PACK_KERNEL_MAP_DOWN(I16, I16),
PACK_KERNEL_MAP_DOWN(U8, F16),
PACK_KERNEL_MAP_DOWN(U8, U8),
PACK_KERNEL_MAP_DOWN(U16, F16),
PACK_KERNEL_MAP_DOWN(U16, U16),
PACK_KERNEL_MAP_DOWN(F16, F16),
PACK_KERNEL_MAP_DOWN(F16, U8),
PACK_KERNEL_MAP_DOWN(F16, U16),
PACK_KERNEL_MAP_DOWN(BF16, BF16),
PACK_KERNEL_MAP_UP(I8, I8),
PACK_KERNEL_MAP_UP(I16, I16),
PACK_KERNEL_MAP_UP(U8, U8),
PACK_KERNEL_MAP_UP(U16, U16),
PACK_KERNEL_MAP_UP(F16, F16),
PACK_KERNEL_MAP_UP(BF16, BF16),
PACK_KERNEL_MAP_UP_OPT(U8, U8),
@ -223,8 +227,6 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
int32_t half_pixel_centers;
uint32_t depth = 0;
int32_t srcFixPointPos = 0;
int32_t dstFixPointPos = 0;
float input_scale = 1.0;
int32_t inputZP = 0;
float output_scale = 1.0;
@ -285,201 +287,16 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
half_pixel_value = 0.0f;
}
if (U8 == input_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant )
{
input_scale = input_attr->asymm.scale;
inputZP = input_attr->asymm.zero_point;
}
else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
srcFixPointPos = input_attr->dfp.fl;
if (srcFixPointPos >= 0)
{
input_scale = 1.0f / (float) ((int64_t)1 << srcFixPointPos);
}
else if (srcFixPointPos < 0)
{
input_scale = (float)((int64_t)1 << -srcFixPointPos);
}
inputZP = 0;
}
else
{
input_scale = 1.0f;
inputZP = 0;
}
if (U8 == output_dtype && VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant )
{
output_scale = output_attr->asymm.scale;
outputZP = output_attr->asymm.zero_point;
}
else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant)
{
dstFixPointPos = output_attr->dfp.fl;
if (dstFixPointPos >= 0)
{
output_scale = (float) ((int64_t)1 << dstFixPointPos);
}
else if (dstFixPointPos < 0)
{
output_scale = 1.0f / (float) ((int64_t)1 << -dstFixPointPos);
}
outputZP = 0;
}
else
{
output_scale = 1.0;
outputZP = 0;
}
input_scale = input_attr->scale;
inputZP = input_attr->zero_point;
output_scale = output_attr->scale;
outputZP = output_attr->zero_point;
gpu_param.global_scale[0] = 4;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
float dfpScale = input_scale * output_scale;
gpu_dp_inst_t uniConvertDFP2FP32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000300, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtact8Bit_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniRightSubLeft_4x4 = {{
0x09090909, // TCfg
0x00000000, // ASelt
0x00230001, 0x00670045, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniDFPtoFp32_left_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00020000, 0x00060004, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
if (I8 == input_dtype && I8 == output_dtype && out_width > in_width)
{
gpu_dp_inst_t uniConvertI32toI16_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniGetMaskShift_2x8 = {{
0x99999999, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x55555555, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniConvertDFP2FP32_part1_4x4 = {{
0x09090909, // TCfg
0x00000000, // ASelt
0x00150004, 0x00370026, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000300, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniConvertI32toI16_2x8", &uniConvertI32toI16_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniGetMaskShift_2x8", &uniGetMaskShift_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniDFPtoFp32_left_4x4", &uniConvertDFP2FP32_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4",
&uniConvertDFP2FP32_part1_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth);
CHECK_STATUS_FAIL_GOTO(status, final );
gpu_param.global_scale[2] = depth;
}
else if (I16 == input_dtype && I16 == output_dtype && out_width > in_width)
{
gpu_dp_inst_t uniConvertI32toI16_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniGetMaskShift_2x8 = {{
0x99999999, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x55555555, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniConvertDFP2FP32_part1_4x4 = {{
0x09090909, // TCfg
0x00000000, // ASelt
0x00150004, 0x00370026, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000300, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniConvertI32toI16_2x8", &uniConvertI32toI16_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniGetMaskShift_2x8", &uniGetMaskShift_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniDFPtoFp32_left_4x4", &uniConvertDFP2FP32_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4",
&uniConvertDFP2FP32_part1_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth);
CHECK_STATUS_FAIL_GOTO(status, final );
gpu_param.global_scale[2] = depth;
}
else
{
status = vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniRightSubLeft_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniDFPtoFp32_left_4x4", &uniDFPtoFp32_left_4x4);
CHECK_STATUS_FAIL_GOTO(status, final );
}
status = vsi_nn_kernel_gpu_add_param( node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "scale_xy", scale_factor);
status |= vsi_nn_kernel_gpu_add_param( node, "dfpScale", &dfpScale);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (U8 == input_dtype && (U8 == output_dtype || F16 == output_dtype))
if ((U8 == input_dtype || U16 == input_dtype || I8 == input_dtype || I16 == input_dtype))
{
float uint8Scale = input_scale / output_scale;
float uint8ZP_out = (float)outputZP;
@ -615,7 +432,7 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
}
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (F16 == input_dtype && (U8 == output_dtype || F16 == output_dtype))
else if (F16 == input_dtype && (U8 == output_dtype || F16 == output_dtype || U16 == output_dtype))
{
float uint8Scale = 1.0f / output_scale;
float uint8ZP_out = (float)outputZP;

View File

@ -0,0 +1,453 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
#define _RESIZE_CUBIC_KERNEL_SOURCE() "resize_cubic"
#define STR(a) #a
// Add kernel hashtable here
#define RESIZE_CUBIC_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
(( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) )
#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_CUBIC_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \
CVIVANTE_NAMESPACE("evis.resize_cubic_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
_RESIZE_CUBIC_KERNEL_SOURCE() }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _resize_cubic_kernel_map[] =
{
PACK_KERNEL_MAP(F16, F16),
PACK_KERNEL_MAP(I16, I16),
PACK_KERNEL_MAP(F16, I16),
PACK_KERNEL_MAP(I16, F16),
PACK_KERNEL_MAP(I8, I8),
PACK_KERNEL_MAP(F16, I8),
PACK_KERNEL_MAP(I8, F16),
PACK_KERNEL_MAP(U8, U8),
PACK_KERNEL_MAP(F16, U8),
PACK_KERNEL_MAP(U8, F16),
};
/*
* Kernel params
*/
static vx_param_description_t _resize_cubic_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
};
#define RESIZE_CUBIC_NUM _cnt_of_array( _resize_cubic_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_resize_cubic_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t *input_attr = NULL;
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t * out_shape = NULL;
float input_scale = 1.0;
float input_tail = 0;
float output_scale = 1.0;
float output_tail = 0;
VSI_UNREFERENCED(param_size);
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0]);
CHECK_PTR_FAIL_GOTO( input_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1]);
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
out_shape = output_attr->shape;
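/* Fold input dequantization and output requantization into scale / tail
   pairs: real = q_in * input_scale + input_tail and
   q_out = real * output_scale + output_tail. */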
if ( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = input_attr->dfp.fl;
if (fl > 0)
{
input_scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
input_scale = (float)((int64_t)1 << -fl);
}
}
else if ( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
input_scale = input_attr->asymm.scale;
input_tail = 0 - input_scale * (float)input_attr->asymm.zero_point;
}
if ( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = output_attr->dfp.fl;
if (fl > 0)
{
output_scale = (float) ((int64_t)1 << fl);
}
else
{
output_scale = 1.0f / (float)((int64_t)1 << -fl);
}
}
else if ( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
output_scale = 1.0f / output_attr->asymm.scale;
output_tail = (float)output_attr->asymm.zero_point;
}
gpu_param.global_scale[0] = 4;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
{
gpu_dp_inst_t uniFp16ToFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtract8Bit_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtractHalf8_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniFp16ToFp32_4x4", &uniFp16ToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniExtractHalf8_2x8", &uniExtractHalf8_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniExtract8Bit_2x8", &uniExtract8Bit_2x8);
}
status |= vsi_nn_kernel_gpu_add_param( node, "input_scale", &input_scale);
status |= vsi_nn_kernel_gpu_add_param( node, "input_tail", &input_tail);
status |= vsi_nn_kernel_gpu_add_param( node, "output_scale", &output_scale);
status |= vsi_nn_kernel_gpu_add_param( node, "output_tail", &output_tail);
gpu_param.global_size[0] = gpu_align_p2(
(out_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (
(out_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1]);
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr );
if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr );
return status;
} /* _resize_cubic_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _resize_cubic_kernel_map;
size_t kernel_map_size = _cnt_of_array( _resize_cubic_kernel_map );
vx_param_description_t * param_def = _resize_cubic_kernel_param_def;
size_t param_def_size = RESIZE_CUBIC_NUM;
vx_kernel_initialize_f initializer = _resize_cubic_initializer;
uint32_t key = 0;
uint32_t i = 0;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = RESIZE_CUBIC_HASH_KEY( in_dtype, out_dtype );
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = (uint32_t)param_def_size;
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_tensor_t* _create_scale_tensor
(
vsi_nn_graph_t *graph,
vsi_size_t output_size,
float scale_factor,
float half_pixel_value,
vsi_nn_tensor_t** index
)
{
vsi_nn_tensor_attr_t attr;
vsi_nn_tensor_t* scale = NULL;
vsi_size_t i = 0;
float *scale_data_ptr = NULL;
int *index_data_ptr = NULL;
float scale_value = 0;
vsi_ssize_t data = 0;
int idx = 0;
float delta_v = 0;
float cubic_coeff_a = -0.5f;
vsi_size_t item_count = 4 * output_size;
scale_data_ptr = (float *)malloc(item_count * sizeof(float));
if (scale_data_ptr == NULL)
{
VSILOGE("allocate memory fail at function %s line %d", __FUNCTION__, __LINE__);
goto OnError;
}
index_data_ptr = (int *)malloc(output_size * sizeof(int));
if (index_data_ptr == NULL)
{
VSILOGE("allocate memory fail at function %s line %d", __FUNCTION__, __LINE__);
goto OnError;
}
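/* For each output coordinate, split the source position into a truncated
   base index (idx) and a fractional offset (delta_v), then precompute the
   four cubic convolution weights (coefficient a = -0.5) for the 4-tap
   neighborhood starting at idx. */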
for (i = 0; i < output_size; i ++)
{
scale_value = ((float)i + half_pixel_value) * scale_factor - half_pixel_value;
data = (vsi_ssize_t)scale_value;
delta_v = scale_value - (float)data;
idx = (int)data - 1;
index_data_ptr[i] = idx;
scale_data_ptr[i * 4 + 0] = cubic_coeff_a * (((delta_v - 4) * (delta_v + 1) + 8) * (delta_v + 1) - 4);
scale_data_ptr[i * 4 + 1] = ((cubic_coeff_a + 2) * delta_v - (cubic_coeff_a + 3)) * delta_v *delta_v + 1;
scale_data_ptr[i * 4 + 2] = ((cubic_coeff_a + 2) * (1 - delta_v) - (cubic_coeff_a + 3))
* (1 - delta_v) * (1 - delta_v) + 1;
scale_data_ptr[i * 4 + 3] = cubic_coeff_a * ((( 2 - delta_v - 5) * (2 - delta_v) + 8) * (2 - delta_v) - 4);
}
memset( &attr, 0, sizeof( attr ) ); /* zero unused attribute fields before use */
attr.size[0] = item_count;
attr.dim_num = 1;
attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
attr.vtl = FALSE;
scale = vsi_nn_CreateTensorFromData(graph, (uint8_t *)scale_data_ptr, &attr);
if (scale_data_ptr)
{
free (scale_data_ptr);
scale_data_ptr = NULL;
}
attr.size[0] = output_size;
attr.dim_num = 1;
attr.dtype.vx_type = VSI_NN_TYPE_INT32;
attr.vtl = FALSE;
*index = vsi_nn_CreateTensorFromData(graph, (uint8_t *)index_data_ptr, &attr);
if (index_data_ptr)
{
free (index_data_ptr);
index_data_ptr = NULL;
}
OnError:
if (scale_data_ptr) free(scale_data_ptr); /* avoid leaking on early exit */
if (index_data_ptr) free(index_data_ptr);
return scale;
}
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[RESIZE_CUBIC_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" );
int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" );
vsi_size_t in_width = inputs[0]->attr.size[0];
vsi_size_t in_height = inputs[0]->attr.size[1];
vsi_size_t out_width = outputs[0]->attr.size[0];
vsi_size_t out_height = outputs[0]->attr.size[1];
float half_pixel_value = 0.0f;
float width_scale = 0.0f;
float height_scale = 0.0f;
vsi_nn_tensor_t* scale_w = NULL;
vsi_nn_tensor_t* scale_h = NULL;
vsi_nn_tensor_t* index_w = NULL;
vsi_nn_tensor_t* index_h = NULL;
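/*
 * Scale factors follow the common resize convention: (in - 1) / (out - 1) when align_corners
 * is set and the output dimension is larger than 1, otherwise in / out; half_pixel_centers
 * shifts the sampling grid by 0.5 through half_pixel_value.
 */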
if (align_corners && out_width > 1)
{
width_scale = ((vx_float32)(in_width - 1) * 1.0f) / (vx_float32)(out_width - 1);
}
else
{
width_scale = ((vx_float32)in_width * 1.0f) / (vx_float32)out_width;
}
if (align_corners && out_height > 1)
{
height_scale = ((vx_float32)(in_height - 1) * 1.0f) / (vx_float32)(out_height - 1);
}
else
{
height_scale = ((vx_float32)in_height * 1.0f) / (vx_float32)out_height;
}
if (half_pixel_centers)
{
half_pixel_value = 0.5f;
}
else
{
half_pixel_value = 0.0f;
}
status = _query_kernel( kernel, inputs, outputs );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
size_t node_params_num = RESIZE_CUBIC_NUM;
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, RESIZE_CUBIC_NUM,
inputs, input_num, outputs, output_num );
scale_w = _create_scale_tensor(graph, out_width,\
width_scale, half_pixel_value, &index_w);
CHECK_PTR_FAIL_GOTO( scale_w, "Create buffer fail.", final );
CHECK_PTR_FAIL_GOTO( index_w, "Create buffer fail.", final );
scale_h = _create_scale_tensor(graph, out_height,\
height_scale, half_pixel_value, &index_h);
CHECK_PTR_FAIL_GOTO( scale_h, "Create buffer fail.", final );
CHECK_PTR_FAIL_GOTO( index_h, "Create buffer fail.", final );
node_params[2] = (vsi_nn_kernel_node_param_t)(scale_w->t);
node_params[3] = (vsi_nn_kernel_node_param_t)(scale_h->t);
node_params[4] = (vsi_nn_kernel_node_param_t)(index_w->t);
node_params[5] = (vsi_nn_kernel_node_param_t)(index_h->t);
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
VSI_ASSERT( status == VSI_SUCCESS );
vsi_nn_kernel_scalar_release( &node_params[2] );
vsi_nn_kernel_scalar_release( &node_params[3] );
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
}
}
final:
vsi_safe_release_tensor(scale_w);
vsi_safe_release_tensor(scale_h);
vsi_safe_release_tensor(index_w);
vsi_safe_release_tensor(index_h);
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( resize_cubic, _setup )
View File
@ -208,52 +208,19 @@ DEF_KERNEL_INITIALIZER(_resize_nearest_initializer)
half_pixel_value = 0.0f;
}
if (U8 == input_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant )
{
input_scale = input_attr->asymm.scale;
inputZP = input_attr->asymm.zero_point;
}
else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
input_scale = input_attr->scale;
inputZP = input_attr->zero_point;
output_scale = 1.0f / output_attr->scale;
outputZP = output_attr->zero_point;
if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
srcFixPointPos = input_attr->dfp.fl;
if (srcFixPointPos >= 0)
{
input_scale = 1.0f / (float) ((int64_t)1 << srcFixPointPos);
}
else if (srcFixPointPos < 0)
{
input_scale = (float)((int64_t)1 << -srcFixPointPos);
}
inputZP = 0;
}
else
{
input_scale = 1.0f;
inputZP = 0;
}
if (U8 == output_dtype && VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant )
{
output_scale = 1.0f / output_attr->asymm.scale;
outputZP = output_attr->asymm.zero_point;
}
else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant)
if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant)
{
dstFixPointPos = output_attr->dfp.fl;
if (dstFixPointPos >= 0)
{
output_scale = (float) ((int64_t)1 << dstFixPointPos);
}
else if (dstFixPointPos < 0)
{
output_scale = 1.0f / (float) ((int64_t)1 << -dstFixPointPos);
}
outputZP = 0;
}
else
{
output_scale = 1.0;
outputZP = 0;
}
if (F16 == input_dtype && F16 == output_dtype)
View File
@ -208,10 +208,7 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_initializer)
height = (int32_t)(attr[2]->shape->data[1]);
index_num = (int32_t)(attr[0]->shape->data[1]);
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
output_zp = attr[2]->asymm.zero_point;
}
output_zp = attr[2]->zero_point;
if (coord_dim == 3)
{
@ -367,10 +364,7 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_big_initializer)
height = (int32_t)(attr[2]->shape->data[1]);
index_num = (int32_t)(attr[0]->shape->data[1]);
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
output_zp = attr[2]->asymm.zero_point;
}
output_zp = attr[2]->zero_point;
if (coord_dim == 3)
{
View File
@ -382,6 +382,12 @@ static vsi_status check_scatter_nd_update_index_repeat
int32_t* mask_buffer = NULL;
int32_t mask_len = 0;
if (indices_num == 1)
{
isRepeat[0] = 0;
return VSI_SUCCESS;
}
if (inputs[1]->attr.is_const == FALSE)
{
isRepeat[0] = 1;
@ -451,7 +457,7 @@ static vsi_status check_scatter_nd_update_index_repeat
else if (mask_buffer[mask_idx] > 0)
{
isRepeat[0] = 1;
status = VSI_FAILURE;
status = VSI_SUCCESS;
CHECK_STATUS_FAIL_GOTO( status, final );
}
}
View File
@ -0,0 +1,861 @@
/****************************************************************************
*
* Copyright (c) 2019 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
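/*
 * Reduction modes of scatter_nd_update; the numeric value matches the "reduction" parameter
 * read in _setup() and is hashed into the key that selects the process-stage kernel.
 */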
typedef enum
{
NONE = 0,
Add,
Mul,
Max,
Min
} vsi_scatter_nd_update_type_e;
/*
* Define kernel meta.
*/
#define KERNEL_SOURCE_1 "scatter_nd_update_reduction"
#define KERNEL_SOURCE_2 "scatter_nd_update_reduction_conv"
#define HASH_SCATTER_ND_UPDATE_KEY(_in0_type, _in2_type, _out_type, _stage, _op_type) \
((_in0_type << 24) | (_in2_type << 16) | (_out_type << 8) | (_stage << 4) | (_op_type))
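/*
 * Key layout: bits [31:24] ref dtype, [23:16] update dtype, [15:8] output dtype,
 * [7:4] pipeline stage (0 = preprocess, 1 = reduction process, 2 = convert), [3:0] reduction op.
 */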
#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_PREPROCESS_NAME(SRC0_TYPE) \
CVIVANTE_NAMESPACE("evis.scatter_nd_update_reduction_preprocess_"#SRC0_TYPE)
#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_PROCESS_NAME(REDUCTION_TYPE, SRC2_TYPE) \
CVIVANTE_NAMESPACE("evis.scatter_nd_update_reduction_"#REDUCTION_TYPE"_"#SRC2_TYPE)
#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_CONV_NAME(DST_TYPE) \
CVIVANTE_NAMESPACE("evis.scatter_nd_update_reduction_conv_"#DST_TYPE)
#define TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(IN0_TYPE, SOURCE) \
{ HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, 0, 0, 0, 0), \
HASH_SCATTER_ND_UPDATE_SH_KERNEL_PREPROCESS_NAME(IN0_TYPE), \
SOURCE },
#define TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(REDUCTION_TYPE, IN2_TYPE, SOURCE) \
{ HASH_SCATTER_ND_UPDATE_KEY(0, IN2_TYPE, 0, 1, REDUCTION_TYPE), \
HASH_SCATTER_ND_UPDATE_SH_KERNEL_PROCESS_NAME(REDUCTION_TYPE, IN2_TYPE), \
SOURCE },
#define TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(OUT_TYPE, SOURCE) \
{ HASH_SCATTER_ND_UPDATE_KEY(0, 0, OUT_TYPE, 2, 0), \
HASH_SCATTER_ND_UPDATE_SH_KERNEL_CONV_NAME(OUT_TYPE), \
SOURCE },
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type scatter_nd_update_reduction_preprocess_map[] =
{
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(U8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(I8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(I16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(F16, KERNEL_SOURCE_1)
};
static const _kernel_map_type scatter_nd_update_reduction_process_map[] =
{
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, U8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, U8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, U8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, U8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, I8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, I8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, I8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, I8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, I16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, I16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, I16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, I16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, F16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, F16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, F16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, F16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, BF16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, BF16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, BF16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, BF16, KERNEL_SOURCE_1)
};
static const _kernel_map_type scatter_nd_update_reduction_conv_map[] =
{
TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(U8, KERNEL_SOURCE_2)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(I8, KERNEL_SOURCE_2)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(I16, KERNEL_SOURCE_2)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(F16, KERNEL_SOURCE_2)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(BF16, KERNEL_SOURCE_2)
};
/*
* Kernel params
*/
static vx_param_description_t _scatter_nd_update_preprocess_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
static vx_param_description_t _scatter_nd_update_process_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
static vx_param_description_t _scatter_nd_update_conv_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM _cnt_of_array(_scatter_nd_update_preprocess_kernel_param_def)
#define _SCATTER_ND_UPDATE_PROCESS_PARAM_NUM _cnt_of_array(_scatter_nd_update_process_kernel_param_def)
#define _SCATTER_ND_UPDATE_CONV_PARAM_NUM _cnt_of_array(_scatter_nd_update_conv_kernel_param_def)
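/*
 * Flattens a tensor to the 2D shape [block_size, element_count / block_size] expected by the
 * shaders and, when coordDim > 1, fills 'strides' with the partial products of the dimensions
 * addressed by the index so a multi-dimensional coordinate can be linearized in the kernel.
 */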
static vsi_status get_scatter_nd_update_tensor_reshape_size
(
vsi_nn_tensor_t ** inputs,
vsi_size_t sizes[VSI_NN_MAX_DIM_NUM],
uint32_t block_size,
uint32_t coordDim,
vsi_size_t strides[VSI_NN_MAX_DIM_NUM],
int32_t* newDim
)
{
vsi_status status = VSI_SUCCESS;
uint32_t dims_num = inputs[0]->attr.dim_num;
vsi_size_t *input_size = inputs[0]->attr.size;
uint32_t i = 0;
vsi_size_t elementCnt = 1;
#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH
newDim[0] = 0;
for (i = 0; i < dims_num; ++i)
{
elementCnt *= input_size[i];
}
for (i = 0; i < VSI_NN_MAX_DIM_NUM; ++i)
{
sizes[i] = 1;
}
sizes[0] = block_size;
sizes[1] = elementCnt / block_size;
newDim[0] = 2;
if (coordDim == 1 && strides) // index shape
{
for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++)
{
strides[i] = 0;
}
}
else if (coordDim >= 2 && coordDim <= VSI_NN_MAX_DIM_NUM && strides)
{
for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++)
{
strides[i] = 0;
}
strides[0] = input_size[dims_num - coordDim];
for (i = 1; i < coordDim - 1; i++)
{
strides[i] = strides[i - 1] * input_size[dims_num - coordDim + i];
}
}
#undef VSI_NN_MAX_IMAGE_WIDTH
return status;
} /* get_scatter_nd_update_tensor_reshape_size() */
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_scatter_nd_update_preprocess_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
1,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
int32_t width = 0;
int32_t element_size = 1;
int32_t input_zp0 = 0;
float input_scale0 = 1;
int32_t i = 0;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
for (i = 0; i < (int32_t)attr[0]->shape->size; i++)
{
element_size *= (int32_t)attr[0]->shape->data[i];
}
width = element_size / 8;
input_zp0 = attr[0]->zero_point;
input_scale0 = attr[0]->scale;
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE)
{
input_scale0 = 1.0f;
}
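/* Global size is element_size / 8, i.e. one work-item per group of 8 elements; tensors with
 * fewer than 8 elements launch one work-item per element instead. */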
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
if (element_size < 8)
{
gpu_param.global_size[0] = element_size;
}
else
{
gpu_param.global_size[0] = width;
}
gpu_param.global_size[1] = 1;
gpu_param.global_size[2] = 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, OnError);
{
gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{
0x05050505, // TCfg
0x04040404, // ASelt
0x00010000, 0x00030002, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0xffff0001, 0x00000000, 0xffff0001, 0x00000000,
0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvert2ndU8SubZpToFp32_4x4 = {{
0x09090909, // TCfg
0x04040404, // ASelt
0x00050004, 0x00070006, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000300, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
status = vsi_nn_kernel_gpu_add_param( node,
"uniConvert1stUint8SubZpToFp32_4x4", &uniConvert1stUint8SubZpToFp32_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvert2ndU8SubZpToFp32_4x4", &uniConvert2ndU8SubZpToFp32_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node, "input_scale", &input_scale0 );
status |= vsi_nn_kernel_gpu_add_param( node, "input_zp", &input_zp0 );
CHECK_STATUS_FAIL_GOTO(status, OnError);
}
OnError:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
return status;
} /* _scatter_nd_update_preprocess_initializer() */
DEF_KERNEL_INITIALIZER(_scatter_nd_update_process_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
2,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL };
int32_t block_size = 1;
int32_t update_width = 1;
int32_t index_num = 1;
int32_t width = 0;
int32_t coord_dim = 0;
int32_t strides[VSI_NN_MAX_DIM_NUM] = {0};
int32_t coord_strides[8] = {0};
int32_t coord_strides1[4] = {0};
int32_t input_zp2 = 0;
float input_scale2 = 1;
int32_t i = 0;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &strides[0]);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &strides[1]);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &strides[2]);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &strides[3]);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &strides[4]);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &strides[5]);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &strides[6]);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &coord_dim);
CHECK_STATUS_FAIL_GOTO(status, OnError );
block_size = (int32_t)(attr[2]->shape->data[0]);
update_width = (int32_t)(attr[1]->shape->data[0]);
index_num = (int32_t)(attr[0]->shape->data[1]);
width = block_size;
input_zp2 = attr[1]->zero_point;
input_scale2 = attr[1]->scale;
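/* Reverse the index strides so coordinate 0 addresses the outermost dimension; the first four
 * entries are passed as "coord_stride" and the remaining ones as "coord_stride1", presumably
 * read by the shader as two 4-wide vectors. */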
coord_strides[coord_dim - 1] = 1;
for (i = 0; i < coord_dim - 1; i++)
{
coord_strides[i] = strides[coord_dim - 2 - i];
}
memcpy(coord_strides1, coord_strides + 4, 4 * sizeof(int32_t));
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = width;
gpu_param.global_size[1] = index_num;
gpu_param.global_size[2] = 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, OnError);
{
gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{
0x05050505, // TCfg
0x04040404, // ASelt
0x00010000, 0x00030002, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0xffff0001, 0x00000000, 0xffff0001, 0x00000000,
0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "update_width", &update_width );
status |= vsi_nn_kernel_gpu_add_param( node, "output_width", &block_size );
status |= vsi_nn_kernel_gpu_add_param( node, "coord_stride", &coord_strides );
status |= vsi_nn_kernel_gpu_add_param( node, "coord_stride1", &coord_strides1 );
CHECK_STATUS_FAIL_GOTO(status, OnError);
if (attr[1]->dtype == U8 || attr[1]->dtype == I8 || attr[1]->dtype == I16)
{
status = vsi_nn_kernel_gpu_add_param( node,
"uniConvert1stUint8SubZpToFp32_4x4", &uniConvert1stUint8SubZpToFp32_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node, "update_scale", &input_scale2 );
status |= vsi_nn_kernel_gpu_add_param( node, "update_zp", &input_zp2 );
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
else if (attr[1]->dtype == BF16)
{
status = vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 );
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
}
OnError:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
if (attr[2])
{
vsi_nn_kernel_tensor_attr_release( &attr[2] );
attr[2] = NULL;
}
return status;
} /* _scatter_nd_update_process_initializer() */
DEF_KERNEL_INITIALIZER(_scatter_nd_update_conv_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
1,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
int32_t width = 0;
int32_t element_size = 1;
int32_t i = 0;
float output_zp = 0;
float output_scale = 1.0f;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
output_zp = (float)attr[0]->zero_point;
output_scale = (float)1.0 / attr[0]->scale;
for (i = 0; i < (int32_t)attr[0]->shape->size; i++)
{
element_size *= (int32_t)attr[0]->shape->data[i];
}
width = element_size / 8;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
if (element_size < 8)
{
gpu_param.global_size[0] = element_size;
}
else
{
gpu_param.global_size[0] = width;
}
gpu_param.global_size[1] = 1;
gpu_param.global_size[2] = 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, OnError);
{
gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractHalf8_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node,
"uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtractHalf8_2x8", &uniExtractHalf8_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtractOddData_2x8", &uniExtractOddData_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node, "output_zp", &output_zp );
status |= vsi_nn_kernel_gpu_add_param( node, "output_scale", &output_scale );
CHECK_STATUS_FAIL_GOTO(status, OnError);
}
OnError:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
return status;
} /* _scatter_nd_update_conv_initializer() */
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel_preprocess,
vsi_nn_kernel_t* kernel_process,
vsi_nn_kernel_t* kernel_conv,
int32_t reduction_flg
)
{
vsi_status status = VSI_SUCCESS;
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e input2_dtype = F16;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
size_t i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = HASH_SCATTER_ND_UPDATE_KEY(input0_dtype, 0, 0, 0, 0);
for ( i = 0; i < _cnt_of_array(scatter_nd_update_reduction_preprocess_map); i ++ )
{
if ( scatter_nd_update_reduction_preprocess_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(scatter_nd_update_reduction_preprocess_map) )
{
snprintf( kernel_preprocess->info.name, VX_MAX_KERNEL_NAME, "%s",
scatter_nd_update_reduction_preprocess_map[i].function_name );
kernel_preprocess->info.parameters = _scatter_nd_update_preprocess_kernel_param_def;
kernel_preprocess->info.numParams = _SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM;
kernel_preprocess->info.initialize = _scatter_nd_update_preprocess_initializer;
vsi_nn_kernel_add_source( kernel_preprocess, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
scatter_nd_update_reduction_preprocess_map[i].source_name );
vsi_nn_kernel_add_source( kernel_preprocess, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
scatter_nd_update_reduction_preprocess_map[i].source_name );
}
else
{
status = VSI_FAILURE;
}
key = HASH_SCATTER_ND_UPDATE_KEY( 0, input2_dtype, 0, 1, reduction_flg);
for ( i = 0; i < _cnt_of_array(scatter_nd_update_reduction_process_map); i ++ )
{
if ( scatter_nd_update_reduction_process_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(scatter_nd_update_reduction_process_map) )
{
snprintf( kernel_process->info.name, VX_MAX_KERNEL_NAME, "%s",
scatter_nd_update_reduction_process_map[i].function_name );
kernel_process->info.parameters = _scatter_nd_update_process_kernel_param_def;
kernel_process->info.numParams = _SCATTER_ND_UPDATE_PROCESS_PARAM_NUM;
kernel_process->info.initialize = _scatter_nd_update_process_initializer;
vsi_nn_kernel_add_source( kernel_process, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
scatter_nd_update_reduction_process_map[i].source_name );
vsi_nn_kernel_add_source( kernel_process, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
scatter_nd_update_reduction_process_map[i].source_name );
}
else
{
status |= VSI_FAILURE;
}
key = HASH_SCATTER_ND_UPDATE_KEY( 0, 0, output_dtype, 2, 0);
for ( i = 0; i < _cnt_of_array(scatter_nd_update_reduction_conv_map); i ++ )
{
if ( scatter_nd_update_reduction_conv_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(scatter_nd_update_reduction_conv_map) )
{
snprintf( kernel_conv->info.name, VX_MAX_KERNEL_NAME, "%s",
scatter_nd_update_reduction_conv_map[i].function_name );
kernel_conv->info.parameters = _scatter_nd_update_conv_kernel_param_def;
kernel_conv->info.numParams = _SCATTER_ND_UPDATE_CONV_PARAM_NUM;
kernel_conv->info.initialize = _scatter_nd_update_conv_initializer;
vsi_nn_kernel_add_source( kernel_conv, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
scatter_nd_update_reduction_conv_map[i].source_name );
vsi_nn_kernel_add_source( kernel_conv, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
scatter_nd_update_reduction_conv_map[i].source_name );
}
else
{
status |= VSI_FAILURE;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_t node = NULL;
vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}};
vsi_size_t strides[VSI_NN_MAX_DIM_NUM] = {0};
int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" );
int32_t reduction = vsi_nn_kernel_param_get_int32( params, "reduction" );
int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0;
int32_t i = 0;
vsi_nn_tensor_t * tensors[2] = { NULL };
vsi_nn_kernel_t * ikernels[2] = { NULL };
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
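/*
 * Three-node pipeline: the preprocess kernel converts the ref tensor into tensors[0], an
 * internal float copy; the process kernel scatters the updates into that copy using the chosen
 * reduction (tensors[1] is a small dummy buffer that only serializes the process and conv
 * nodes); the conv kernel then converts the float result back to the output dtype.
 */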
status = get_scatter_nd_update_tensor_reshape_size(&inputs[1], shapes[0], coord_dim, 0,
NULL, &rs_idx_dim);
status |= get_scatter_nd_update_tensor_reshape_size(&inputs[2], shapes[1], block_size, 0,
NULL, &rs_in_dim);
status |= get_scatter_nd_update_tensor_reshape_size(&outputs[0], shapes[2], block_size, coord_dim,
strides, &rs_out_dim);
CHECK_STATUS_FAIL_GOTO( status, final );
{
vsi_nn_tensor_attr_t attr;
vsi_nn_kernel_node_t preprocess_node = NULL;
vsi_nn_kernel_node_t process_node = NULL;
vsi_nn_kernel_node_param_t preprocess_params[_SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_param_t process_params[_SCATTER_ND_UPDATE_PROCESS_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_param_t conv_params[_SCATTER_ND_UPDATE_CONV_PARAM_NUM] = { NULL };
int32_t width = 1;
int32_t res = 0;
ikernels[0] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS );
ikernels[0]->unique_id = kernel->unique_id;
ikernels[1] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS );
ikernels[1]->unique_id = kernel->unique_id;
memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) );
attr.dtype = outputs[0]->attr.dtype;
attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
attr.is_const = FALSE;
attr.vtl = TRUE;
for (i = 0; i < rs_out_dim; i++)
{
attr.size[i] = shapes[2][i];
width *= (int32_t)shapes[2][i];
}
attr.dim_num = rs_out_dim;
res = width % 8;
width = (width >> 3) << 3;
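/* width is rounded down to a multiple of 8 and res keeps the remainder; both are passed to the
 * preprocess and conv kernels so they can handle the vectorized part and the tail elements. */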
tensors[0] = vsi_nn_CreateTensor( graph, &attr ); // ref'
attr.size[0] = 1;
attr.size[1] = 1;
attr.dim_num = rs_out_dim;
tensors[1] = vsi_nn_CreateTensor( graph, &attr ); // link_buffer0
status = _query_kernel( inputs, outputs, ikernels[0], ikernels[1], kernel, reduction);
if ( VSI_SUCCESS == status)
{
// convert ref to float
preprocess_node = vsi_nn_kernel_create_node( graph, ikernels[0] );
if (preprocess_node)
{
uint32_t index = 0;
/* Pass parameters to node. */
preprocess_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[2], rs_out_dim );
preprocess_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t;
preprocess_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
preprocess_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &res );
status = vsi_nn_kernel_node_pass_param( preprocess_node, preprocess_params,
_SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_tensor_release( &preprocess_params[0] );
vsi_nn_kernel_scalar_release( &preprocess_params[2] );
vsi_nn_kernel_scalar_release( &preprocess_params[3] );
}
// update
process_node = vsi_nn_kernel_create_node( graph, ikernels[1] );
if (process_node)
{
uint32_t index = 0;
/* Pass parameters to node. */
process_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[0], rs_idx_dim );
process_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shapes[1], rs_in_dim );
process_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t;
process_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t;
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[0] );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[1] );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[2] );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[3] );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[4] );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[5] );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[6] );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim );
status = vsi_nn_kernel_node_pass_param( process_node, process_params,
_SCATTER_ND_UPDATE_PROCESS_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_tensor_release( &process_params[0] );
vsi_nn_kernel_tensor_release( &process_params[1] );
vsi_nn_kernel_scalar_release( &process_params[4] );
vsi_nn_kernel_scalar_release( &process_params[5] );
vsi_nn_kernel_scalar_release( &process_params[6] );
vsi_nn_kernel_scalar_release( &process_params[7] );
vsi_nn_kernel_scalar_release( &process_params[8] );
vsi_nn_kernel_scalar_release( &process_params[9] );
vsi_nn_kernel_scalar_release( &process_params[10] );
vsi_nn_kernel_scalar_release( &process_params[11] );
}
// convert float to output
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
uint32_t index = 0;
/* Pass parameters to node. */
conv_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t;
conv_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t;
conv_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], rs_out_dim );
conv_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
conv_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &res );
status = vsi_nn_kernel_node_pass_param( node, conv_params, _SCATTER_ND_UPDATE_CONV_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_tensor_release( &conv_params[2] );
vsi_nn_kernel_scalar_release( &conv_params[3] );
vsi_nn_kernel_scalar_release( &conv_params[4] );
}
}
if (preprocess_node) {vsi_nn_kernel_node_release( &preprocess_node );}
if (process_node) {vsi_nn_kernel_node_release( &process_node );}
}
final:
if (ikernels[0])
{
vsi_nn_kernel_release(&ikernels[0]);
}
if (ikernels[1])
{
vsi_nn_kernel_release(&ikernels[1]);
}
vsi_safe_release_tensor(tensors[0]);
vsi_safe_release_tensor(tensors[1]);
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( scatter_nd_update_reduction, _setup )
View File
@ -22,6 +22,7 @@
*
*****************************************************************************/
#if !(VX_TENSOR_SELECT_VX_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -159,7 +160,6 @@ DEF_KERNEL_INITIALIZER(_select_initializer)
vsi_nn_kernel_tensor_attr_t *input1_attr = NULL;
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t *output_shape = NULL;
int32_t input0_fl = 0, input1_fl = 0, output_fl = 0;
float input0Scale = 1.0f;
int32_t input0Zp = 0;
float input1Scale = 1.0f;
@ -180,59 +180,12 @@ DEF_KERNEL_INITIALIZER(_select_initializer)
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output);
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
if ( input0_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
input0_fl = input0_attr->dfp.fl;
if (input0_fl > 0)
{
input0Scale = 1.0f / (float) ((int64_t)1 << input0_fl);
}
else
{
input0Scale = (float)((int64_t)1 << -input0_fl);
}
}
else if ( input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
input0Scale = input0_attr->asymm.scale;
input0Zp = input0_attr->asymm.zero_point;
}
if ( input1_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
input1_fl = input1_attr->dfp.fl;
if (input1_fl > 0)
{
input1Scale = 1.0f / (float) ((int64_t)1 << input1_fl);
}
else
{
input1Scale = (float)((int64_t)1 << -input1_fl);
}
}
else if ( input1_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
input1Scale = input1_attr->asymm.scale;
input1Zp = input1_attr->asymm.zero_point;
}
if ( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
output_fl = output_attr->dfp.fl;
if (output_fl > 0)
{
outputScale = 1.0f / (float) ((int64_t)1 << output_fl);
}
else
{
outputScale = (float)((int64_t)1 << -output_fl);
}
}
else if ( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
outputScale = output_attr->asymm.scale;
outputZP = output_attr->asymm.zero_point;
}
input0Scale = input0_attr->scale;
input0Zp = input0_attr->zero_point;
input1Scale = input1_attr->scale;
input1Zp = input1_attr->zero_point;
outputScale = output_attr->scale;
outputZP = output_attr->zero_point;
gpu_quantize_multiplier_16bit(input0Scale / outputScale, &in0_M0, &in0_postShift);
gpu_quantize_multiplier_16bit(input1Scale / outputScale, &in1_M0, &in1_postShift);
@ -541,3 +494,4 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_EVIS( select, _setup )
#endif
View File
@ -131,42 +131,10 @@ DEF_KERNEL_INITIALIZER(_sequence_mask_initializer)
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
out_shape = attr[1]->shape;
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
input_zp = attr[0]->asymm.zero_point;
scaleIn = attr[0]->asymm.scale;
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[0]->dfp.fl > 0)
{
scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
}
else
{
scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
input_zp = 0;
}
if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
output_zp = attr[1]->asymm.zero_point;
scaleOut = 1.0f / attr[1]->asymm.scale;
}
else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[1]->dfp.fl > 0)
{
scaleOut = (float)((int64_t)1 << attr[1]->dfp.fl);
}
else
{
scaleOut = (1.0f / (float)((int64_t)1 << -attr[1]->dfp.fl));
}
output_zp = 0;
}
input_zp = attr[0]->zero_point;
scaleIn = attr[0]->scale;
output_zp = attr[1]->zero_point;
scaleOut = 1.0f / attr[1]->scale;
outputVal1 = scaleOut + (float)output_zp;
View File
@ -157,8 +157,6 @@ DEF_KERNEL_INITIALIZER(_slice_initializer)
float scaleOut = 1.0f;
int32_t output_ZP = 0;
int32_t input_ZP = 0;
int32_t srcFixPointPos = 0;
int32_t dstFixPointPos = 0;
int32_t is_samefl = 0;
uint32_t pack_key = 0;
@ -178,41 +176,10 @@ DEF_KERNEL_INITIALIZER(_slice_initializer)
pack_key = _PACK_SLICE_KEY( input_dtype, output_dtype, is_samefl);
if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
srcFixPointPos = input_attr->dfp.fl;
if (srcFixPointPos > 0)
{
scaleIn = (1.0f / ((float) ((int64_t)1 << srcFixPointPos)));
}
else
{
scaleIn = ((float) ((int64_t)1 << -srcFixPointPos));
}
}
else if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant)
{
input_ZP = input_attr->asymm.zero_point;
scaleIn = input_attr->asymm.scale;
}
if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant)
{
dstFixPointPos = output_attr->dfp.fl;
if (dstFixPointPos > 0)
{
scaleOut = (1.0f / ((float) ((int64_t)1 << dstFixPointPos)));
}
else
{
scaleOut = ((float) ((int64_t)1 << -dstFixPointPos));
}
}
else if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant)
{
output_ZP = output_attr->asymm.zero_point;
scaleOut = output_attr->asymm.scale;
}
input_ZP = input_attr->zero_point;
scaleIn = input_attr->scale;
output_ZP = output_attr->zero_point;
scaleOut = output_attr->scale;
if ((I8 == input_dtype && input_dtype == output_dtype ) ||
(U8 == input_dtype && input_dtype == output_dtype ) )
View File
@ -170,23 +170,8 @@ DEF_KERNEL_INITIALIZER(_get_matrix_initializer)
attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final );
if ( attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr->dfp.fl;
if (fl > 0)
{
input_scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
input_scale = (float)((int64_t)1 << -fl);
}
}
else if ( attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
input_scale = attr->asymm.scale;
input_tail = 0 - attr->asymm.zero_point * input_scale;
}
input_scale = attr->scale;
input_tail = 0 - attr->zero_point * input_scale;
in_shape = attr->shape;
@ -265,42 +250,10 @@ DEF_KERNEL_INITIALIZER(_warp_affine_initializer)
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[0]->dfp.fl;
if (fl > 0)
{
input_scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
input_scale = (float)((int64_t)1 << -fl);
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
input_scale = attr[0]->asymm.scale;
input_tail = 0 - attr[0]->asymm.zero_point * input_scale;
}
if (attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
int32_t fl = attr[1]->dfp.fl;
if (fl >= 0)
{
output_scale = (vx_float32) ((vx_int64)1 << fl);
}
else if (fl < 0)
{
output_scale = 1.0f / (vx_float32) ((vx_int64)1 << -fl);
}
}
else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
output_scale = 1.0f / attr[1]->asymm.scale;
output_zp = (float)attr[1]->asymm.zero_point;
}
input_scale = attr[0]->scale;
input_tail = 0 - attr[0]->zero_point * input_scale;
output_scale = 1.0f / attr[1]->scale;
output_zp = (float)attr[1]->zero_point;
out_shape = attr[1]->shape;
View File
@ -166,8 +166,6 @@ DEF_KERNEL_INITIALIZER(_swish_initializer)
vx_tensor input = (vx_tensor)param[0];
vx_tensor output = (vx_tensor)param[1];
int8_t srcFixPointPos = 0;
int8_t dstFixPointPos = 0;
vx_float32 inputTail = 0;
vx_float32 inputScale = 1.0f;
vx_float32 outputZP = 0;
@ -186,42 +184,11 @@ DEF_KERNEL_INITIALIZER(_swish_initializer)
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
out_shape = output_attr->shape;
inputScale = input_attr->scale;
inputTail = 0 - (vx_float32)input_attr->zero_point * inputScale;
outputScale = 1.0f / output_attr->scale;
outputZP = (vx_float32)(output_attr->zero_point);
if (input_attr->quant == VSI_NN_KERNEL_QUANT_DFP)
{
srcFixPointPos = (int8_t)input_attr->dfp.fl;
if (srcFixPointPos > 0)
{
inputScale = 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos);
}
else
{
inputScale = (vx_float32)((int64_t)1 << -srcFixPointPos);
}
}
else if (input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM || input_attr->quant == VSI_NN_KERNEL_QUANT_SYMM)
{
inputScale = input_attr->asymm.scale;
inputTail = 0 - input_attr->asymm.zero_point * inputScale;
}
if (output_attr->quant == VSI_NN_KERNEL_QUANT_DFP)
{
dstFixPointPos = (int8_t)output_attr->dfp.fl;
if (dstFixPointPos > 0)
{
outputScale = (vx_float32) ((int64_t)1 << dstFixPointPos);
}
else
{
outputScale = 1.0f / (vx_float32) ((int64_t)1 << -dstFixPointPos);
}
}
else if (output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM || output_attr->quant == VSI_NN_KERNEL_QUANT_SYMM)
{
outputScale = 1.0f / output_attr->asymm.scale;
outputZP = (vx_float32)(output_attr->asymm.zero_point);
}
#define _PACK_SELECT_KEY( IN_TYPE, OUT_TYPE ) \
(IN_TYPE | ( OUT_TYPE << 16))
@ -379,8 +346,6 @@ DEF_KERNEL_INITIALIZER(_hswish_initializer)
vx_tensor input = (vx_tensor)param[0];
vx_tensor output = (vx_tensor)param[1];
int8_t srcFixPointPos = 0;
int8_t dstFixPointPos = 0;
vx_float32 inputTail = 0;
vx_float32 inputScale = 1.0f;
vx_float32 outputZP = 0;
@ -398,42 +363,11 @@ DEF_KERNEL_INITIALIZER(_hswish_initializer)
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
out_shape = output_attr->shape;
inputScale = input_attr->scale;
inputTail = 0 - (vx_float32)input_attr->zero_point * inputScale;
outputScale = 1.0f / output_attr->scale;
outputZP = (vx_float32)(output_attr->zero_point);
if (input_attr->quant == VSI_NN_KERNEL_QUANT_DFP)
{
srcFixPointPos = (int8_t)input_attr->dfp.fl;
if (srcFixPointPos > 0)
{
inputScale = 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos);
}
else
{
inputScale = (vx_float32)((int64_t)1 << -srcFixPointPos);
}
}
else if (input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM || input_attr->quant == VSI_NN_KERNEL_QUANT_SYMM)
{
inputScale = input_attr->asymm.scale;
inputTail = 0 - input_attr->asymm.zero_point * inputScale;
}
if (output_attr->quant == VSI_NN_KERNEL_QUANT_DFP)
{
dstFixPointPos = (int8_t)output_attr->dfp.fl;
if (dstFixPointPos > 0)
{
outputScale = (vx_float32) ((int64_t)1 << dstFixPointPos);
}
else
{
outputScale = 1.0f / (vx_float32) ((int64_t)1 << -dstFixPointPos);
}
}
else if (output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM || output_attr->quant == VSI_NN_KERNEL_QUANT_SYMM)
{
outputScale = 1.0f / output_attr->asymm.scale;
outputZP = (vx_float32)(output_attr->asymm.zero_point);
}
#define _PACK_SELECT_KEY( IN_TYPE, OUT_TYPE ) \
(IN_TYPE | ( OUT_TYPE << 16))
Some files were not shown because too many files have changed in this diff