Update internal ovxlib to rel/1.2.2 (#674)
Update to SHA: 806fcd6a69d333e62508acf0a6aa2c38c8385eae
Type: Code Improvement
Signed-off-by: Feiyue Chen <Feiyue.Chen@verisilicon.com>

parent cf099e3849
commit 2d9e614a06
@@ -3,6 +3,9 @@
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore

# Some header file
include/vsi_nn_feature_config.h

# User-specific files
*.suo
*.user
@@ -195,3 +195,5 @@ DEF_OP(GRID_SAMPLE)
DEF_OP(LPNORM)
DEF_OP(RESIZE_3D)
DEF_OP(REDUCEL2)
DEF_OP(CROP_AND_RESIZE)
DEF_OP(TAN)
@@ -55,6 +55,7 @@ typedef int32_t vsi_nn_kernel_lut_act_e; enum
    VSI_NN_KERNEL_LUT_ATANH = 21,
    VSI_NN_KERNEL_LUT_ACOSH = 22,
    VSI_NN_KERNEL_LUT_INVERSE_SIGMOID = 23,
    VSI_NN_KERNEL_LUT_TAN = 24,

};
@@ -106,10 +106,21 @@ enum
    BI_LSTM_BW_INPUT_LAYERNORM_C = 54,
    BI_LSTM_BW_INPUT_LAYERNORM_O = 55,

    BI_LSTM_FW_INPUT_BIAS_R2I = 56,
    BI_LSTM_FW_INPUT_BIAS_R2F = 57,
    BI_LSTM_FW_INPUT_BIAS_R2C = 58,
    BI_LSTM_FW_INPUT_BIAS_R2O = 59,

    BI_LSTM_BW_INPUT_BIAS_R2I = 60,
    BI_LSTM_BW_INPUT_BIAS_R2F = 61,
    BI_LSTM_BW_INPUT_BIAS_R2C = 62,
    BI_LSTM_BW_INPUT_BIAS_R2O = 63,

    BI_LSTM_INPUT_CNT,

    BI_LSTM_FW_OUTPUT_OUTPUT = 0,
    BI_LSTM_BW_OUTPUT_OUTPUT = 1,

    BI_LSTM_OUTPUT_CNT
};
@@ -0,0 +1,47 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#ifndef _VSI_NN_OP_CROP_AND_RESIZE_H
#define _VSI_NN_OP_CROP_AND_RESIZE_H

#include "vsi_nn_types.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef struct _vsi_nn_crop_and_resize_param
{
    struct _crop_and_resize_local_data_t * lcl_data;
    const int32_t* crop_size;
    vsi_enum resize_method;
    float extrapolation_value;
} vsi_nn_crop_and_resize_param;

#ifdef __cplusplus
}
#endif

#endif
@@ -70,6 +70,11 @@ enum
    LSTM_INPUT_AUX_WEIGHT_I2C = 27,
    LSTM_INPUT_AUX_WEIGHT_I2O = 28,

    LSTM_INPUT_BIAS_R2I = 29,
    LSTM_INPUT_BIAS_R2F = 30,
    LSTM_INPUT_BIAS_R2C = 31,
    LSTM_INPUT_BIAS_R2O = 32,

    LSTM_INPUT_CNT,

    LSTM_OUTPUT_OUTPUT = 0,
@@ -74,6 +74,11 @@ enum
    LSTMUNIT_INPUT_AUX_WEIGHT_I2C = 27,
    LSTMUNIT_INPUT_AUX_WEIGHT_I2O = 28,

    LSTMUNIT_INPUT_BIAS_R2I = 29,
    LSTMUNIT_INPUT_BIAS_R2F = 30,
    LSTMUNIT_INPUT_BIAS_R2C = 31,
    LSTMUNIT_INPUT_BIAS_R2O = 32,

    LSTMUNIT_INPUT_CNT,

    LSTMUNIT_OUTPUT_OUTPUT = 0,
@@ -38,7 +38,8 @@ typedef uint32_t vsi_nn_interpolation_type_t; enum
{
    VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR = 0,
    VSI_NN_INTERPOLATION_BILINEAR,
    VSI_NN_INTERPOLATION_AREA
    VSI_NN_INTERPOLATION_AREA,
    VSI_NN_INTERPOLATION_CUBIC
};

typedef uint32_t vsi_nn_resize_layout_type_t; enum
@@ -33,6 +33,7 @@ extern "C" {
typedef struct _vsi_nn_scatter_nd_update_param
{
    vsi_bool use_locking;
    vsi_nn_reduction_type_e reduction;
} vsi_nn_scatter_nd_update_param;

#ifdef __cplusplus
@@ -471,6 +471,12 @@ char* vsi_nn_getenv
    const char * var_name
    );

int32_t vsi_nn_getenv_asint
    (
    const char* env,
    int32_t default_value
    );

FILE* vsi_nn_fopen
    (
    const char * file_name,
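A minimal usage sketch for the new environment helper (not part of the diff; the variable name is an assumption):

    /* Read an integer option from the environment, falling back to the
     * supplied default when the variable is unset or not an integer. */
    int32_t opt = vsi_nn_getenv_asint("VSI_NN_SOME_OPTION", 0);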
@@ -43,6 +43,7 @@ class IDevice {
    OVXLIB_API IDevice(uint32_t id);
    OVXLIB_API ~IDevice();
    OVXLIB_API uint32_t Id() const;
    OVXLIB_API bool GraphSubmit(vsi_nn_graph_t* graph, bool (*func)(const void*), data_t data);
    OVXLIB_API bool GraphSubmit(vsi_nn_graph_t* graph, func_t func, data_t data);
    OVXLIB_API bool GraphRemove(const vsi_nn_graph_t* graph);
    OVXLIB_API bool ThreadExit();
@@ -79,6 +79,8 @@ typedef struct _vsi_nn_runtime_option_t
    int32_t enable_dataconvert_optimize;
    int32_t enable_stream_processor;
    int32_t enable_rgb88_planar_nhwc;
    int32_t enable_slice_optimize;
    int32_t enable_batch_opt;
} vsi_nn_runtime_option_t;

/**
@@ -1,3 +1,26 @@
/****************************************************************************
*
* Copyright (c) 2019 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
/*****Auto generated header file, Please DO NOT modify manually!*****/
#ifndef _VSI_NN_FEATURE_CONFIG_H
#define _VSI_NN_FEATURE_CONFIG_H

@@ -20,5 +43,15 @@
#define VSI_CONCAT_ENHANCE_SUPPORT
#endif
#define VSI_CREATE_TENSOR_FROM_VIEW_SUPPORT
#ifndef VSI_SWAP_HANDLE_CACHE_SUPPORT
#define VSI_SWAP_HANDLE_CACHE_SUPPORT
#endif
#define VSI_EXPORT_APIS_FOR_SETUP_GRAPH 1
#if defined(VX_SET_TENSOR_MEMPOOL_TYPE_SUPPORT) && VX_SET_TENSOR_MEMPOOL_TYPE_SUPPORT
#define VSI_CREATE_TENSOR_FROM_AXISRAM_SUPPORT
#endif
#if defined(VX_13_NN_COMPATIBLITY)
#define VSI_MAP_TENSOR_PATCH_SUPPORT
#endif

#endif
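Downstream code is expected to feature-gate on these generated macros; a hedged sketch, not part of the diff:

    #include "vsi_nn_feature_config.h"

    #ifdef VSI_MAP_TENSOR_PATCH_SUPPORT
    /* Only compiled when the driver headers define VX_13_NN_COMPATIBLITY. */
    #endif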
@@ -382,6 +382,31 @@ OVXLIB_API vsi_nn_tensor_id_t vsi_nn_AddTensorFromView
    vsi_size_t* end
    );

/**
 * Add a new tensor from AXI-SRAM
 * Create a new tensor from internal AXI-SRAM and add it to the graph.
 * It only creates the tensor object and does not actually allocate the memory
 * in AXI-SRAM until the verify-graph stage. In other words, the tensor object is
 * created beforehand, but the memory for storing its data is not allocated until the
 * verify-graph stage. AXI-SRAM is an internal memory resource whose allocation is done
 * strategically to optimize performance and resource usage during graph verification.
 * If there is not enough memory in AXI-SRAM, vsi_nn_VerifyGraph will return VSI_FAILURE.
 * The user cannot access the tensor memory (read/write tensor data) before the graph has
 * been verified, since the tensor memory is not allocated yet.
 * @param[in] graph Graph handle
 * @param[in] id Optional id for the tensor; set it to VSI_NN_TENSOR_ID_AUTO
 * and a new id will be generated.
 * @param[in] attr Tensor attributes for the new tensor.
 *
 * @return The new tensor id on success, or VSI_NN_TENSOR_ID_NA otherwise.
 */
OVXLIB_API vsi_nn_tensor_id_t vsi_nn_AddTensorFromAXISRAM
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_id_t id,
    vsi_nn_tensor_attr_t * attr
    );

/**
 * Attach tensor to graph
 * Attach an existing tensor to the graph.

@@ -796,6 +821,18 @@ OVXLIB_API vsi_status vsi_nn_SetGraphTransformOption
    size_t size
    );

/**
 * Graph shape inference
 *
 * @param[in] graph Graph handle
 *
 * @return VSI_SUCCESS on success, or an appropriate error code otherwise.
 */
OVXLIB_API vsi_status vsi_nn_InferShape
    (
    vsi_nn_graph_t* graph
    );

#ifdef __cplusplus
}
#endif
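A usage sketch for the new API, assuming an already-built graph and a filled attribute struct (all names other than the declared API are illustrative):

    vsi_nn_tensor_attr_t attr;
    memset( &attr, 0, sizeof(attr) );
    /* ... fill attr.size, attr.dim_num and attr.dtype ... */
    vsi_nn_tensor_id_t id = vsi_nn_AddTensorFromAXISRAM( graph, VSI_NN_TENSOR_ID_AUTO, &attr );
    if ( VSI_NN_TENSOR_ID_NA == id )
    {
        /* handle error */
    }
    /* The AXI-SRAM backing is only allocated inside vsi_nn_VerifyGraph(),
     * which returns VSI_FAILURE when SRAM is exhausted; do not touch the
     * tensor data before verification succeeds. */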
@@ -155,6 +155,22 @@ OVXLIB_API void vsi_nn_PrintNode
    vsi_nn_node_id_t id
    );

#if VX_GRAPH_BATCH_OPT_SUPPORT
/**
 * Set how many pieces this node is divided into along the batch dimension.
 *
 * @param[in] node Node.
 * @param[in] split_num Number of batch splits.
 *
 * @return VSI_SUCCESS on success, or an error code otherwise.
 */
OVXLIB_API vsi_status vsi_nn_SetNodeBatchSplitNum
    (
    vsi_nn_node_t* node,
    int8_t split_num
    );
#endif

/**
 * Update node attribute
 * Update the OpenVX node attribute based on ovxlib's node attribute
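An illustrative call only, guarded the same way as the declaration (not part of the diff):

    #if VX_GRAPH_BATCH_OPT_SUPPORT
    /* Ask the runtime to split this node into 4 pieces along the batch dim. */
    vsi_status status = vsi_nn_SetNodeBatchSplitNum( node, 4 );
    #endif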
@@ -209,6 +209,7 @@
#include "ops/vsi_nn_op_lpnorm.h"
#include "ops/vsi_nn_op_resize_3d.h"
#include "ops/vsi_nn_op_reducel2.h"
#include "ops/vsi_nn_op_crop_and_resize.h"
/* custom node head define */
#include "custom/vsi_nn_custom_node_type.h"
#include "ops/vsi_nn_op_inverse_sigmoid.h"

@@ -406,6 +407,7 @@ typedef union _vsi_nn_nn_param
    vsi_nn_lpnorm_param lpnorm;
    vsi_nn_resize_3d_param resize_3d;
    vsi_nn_reducel2_param reducel2;
    vsi_nn_crop_and_resize_param crop_and_resize;
    void* client_param;

    /* custom node data struct define */
@@ -35,6 +35,9 @@
#if defined(VX_KHR_COMPATIBILITY) && (0x1==VX_KHR_COMPATIBILITY)
#include <VX/vx_khr_compatible.h>
#endif
#ifdef VSI_CREATE_TENSOR_FROM_AXISRAM_SUPPORT
#include <VX/vx_viv_sys.h>
#endif

/*
This is a compatibility head file for backward compatibility OpenVX 1.1 spec
@@ -89,6 +89,8 @@ typedef enum
    VSI_NN_SOURCE_FORMAT_IMAGE_YUYV422,
    VSI_NN_SOURCE_FORMAT_IMAGE_UYVY422,
    VSI_NN_SOURCE_FORMAT_IMAGE_NV21,
    VSI_NN_SOURCE_FORMAT_IMAGE_NV12_RGGB,
    VSI_NN_SOURCE_FORMAT_IMAGE_NV21_BGGR,
} vsi_nn_preprocess_source_format_e;

/**
@@ -54,5 +54,10 @@
#include "utils/vsi_nn_dtype_util.h"
#include "quantization/vsi_nn_asymmetric_affine.h"
#include "quantization/vsi_nn_dynamic_fixed_point.h"

#if defined(VSI_ENABLE_LCOV_TEST) && VSI_ENABLE_LCOV_TEST
#include "lcov/vsi_nn_coverage.h"
#endif

#endif
@@ -817,6 +817,82 @@ vsi_nn_tensor_t * vsi_nn_dropout_tensor
    float rate
    );

/**
 * Allows the application to get direct access to a patch of a tensor object.
 * A wrapper API for OpenVX vxMapTensorPatch.
 *
 * @param[in] graph Graph handle.
 * @param[in] tensor Tensor handle.
 * @param[out] ptr The address of a pointer that the function sets to the
 * address where the requested data can be accessed. The returned (*ptr) address
 * is only valid between the call to the function and the corresponding call to
 * vsi_nn_UnmapTensorPatch.
 * @param[in] usage This declares the access mode for the tensor patch, using
 * the vsi_nn_accessor_type_e enumeration.
 * VSI_NN_READ_ONLY: after the function call, the content of the memory location
 * pointed to by (*ptr) contains the tensor patch data. Writing into this memory location
 * is forbidden and its behavior is undefined.
 * VSI_NN_READ_AND_WRITE: after the function call, the content of the memory
 * location pointed to by (*ptr) contains the tensor patch data; writing into this memory
 * is allowed only for the location of items and will result in a modification of the
 * affected items in the tensor object once the range is unmapped. Writing into
 * a gap between items (when (*stride) > item size in bytes) is forbidden and its
 * behavior is undefined.
 * VSI_NN_WRITE_ONLY: after the function call, the memory location pointed to by (*ptr)
 * contains undefined data; writing each item of the range is required prior to
 * unmapping. Items not written by the application before unmap will become
 * undefined after unmap, even if they were well defined before map. As with
 * VSI_NN_READ_AND_WRITE, writing into a gap between items is forbidden and its behavior
 * is undefined.
 * @return VSI_SUCCESS on success, or an error code otherwise.
 */

OVXLIB_API vsi_status vsi_nn_MapTensorPatch
    (
    vsi_nn_graph_t* graph,
    vsi_nn_tensor_t* tensor,
    void** ptr,
    vsi_nn_accessor_type_e usage
    );

/**
 * Unmap and commit potential changes to a tensor object patch that was previously mapped.
 * Unmapping a tensor patch invalidates the memory location from which the patch could
 * be accessed by the application. Accessing this memory location after the unmap function
 * completes has undefined behavior.
 * @param[in] graph Graph handle.
 * @param[in] tensor The reference to the tensor object to unmap.
 * @return VSI_SUCCESS on success, or an error code otherwise.
 */

OVXLIB_API vsi_status vsi_nn_UnmapTensorPatch
    (
    vsi_nn_graph_t* graph,
    vsi_nn_tensor_t* tensor
    );

/**
 * Create a new tensor from internal AXI-SRAM (kernel-driver mapped).
 * It only creates the tensor object and does not actually allocate the memory
 * in AXI-SRAM until the verify-graph stage. In other words, the tensor
 * object is created beforehand, but the memory for storing its data is not
 * allocated until the verify-graph stage. AXI-SRAM is an internal memory resource
 * whose allocation is done strategically to optimize performance and
 * resource usage during graph verification.
 * If there is not enough memory in AXI-SRAM, vsi_nn_VerifyGraph will return VSI_FAILURE.
 * The user cannot access the tensor memory (read/write tensor data) before the graph has
 * been verified, since the tensor memory is not allocated yet.
 * @param[in] graph Graph handle
 * @param[in] attr Tensor attributes for the new tensor.
 *
 * @return Tensor handle on success, or NULL otherwise.
 */
OVXLIB_API vsi_nn_tensor_t * vsi_nn_CreateTensorFromAXISRAM
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_attr_t * attr
    );

#ifdef __cplusplus
}
#endif
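A read-only mapping sketch based on the comments above; the graph must already be verified so that the tensor memory exists (variable names are assumptions, not part of the diff):

    void* data = NULL;
    if ( vsi_nn_MapTensorPatch( graph, tensor, &data, VSI_NN_READ_ONLY ) == VSI_SUCCESS )
    {
        /* Read through data here; writing is forbidden under VSI_NN_READ_ONLY. */
        vsi_nn_UnmapTensorPatch( graph, tensor );
        /* data is invalid from this point on. */
    }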
@@ -115,7 +115,9 @@ typedef enum
{
    VSI_NN_REDUCTION_TYPE_NONE,
    VSI_NN_REDUCTION_TYPE_ADD,
    VSI_NN_REDUCTION_TYPE_MUL
    VSI_NN_REDUCTION_TYPE_MUL,
    VSI_NN_REDUCTION_TYPE_MAX,
    VSI_NN_REDUCTION_TYPE_MIN
} vsi_nn_reduction_type_e;

/** Pad mode enum */

@@ -269,7 +271,9 @@ typedef enum _vsi_nn_yuv_type
typedef enum _vsi_nn_nv_type
{
    VSI_NN_YUV_TYPE_NV12,
    VSI_NN_YUV_TYPE_NV21
    VSI_NN_YUV_TYPE_NV21,
    VSI_NN_YUV_TYPE_NV12_RGGB,
    VSI_NN_YUV_TYPE_NV21_BGGR
} vsi_nn_nv_type;

typedef enum _vsi_nn_roi_align_type_e

@@ -283,6 +287,12 @@ typedef enum _vsi_nn_custom_warp_affine_type_e {
    VSI_NN_WARP_AFFINE_TYPE_RGB
} vsi_nn_custom_warp_affine_type_e;

typedef enum _vsi_nn_accessor_type_e {
    VSI_NN_READ_ONLY = VX_READ_ONLY,
    VSI_NN_WRITE_ONLY = VX_WRITE_ONLY,
    VSI_NN_READ_AND_WRITE = VX_READ_AND_WRITE
} vsi_nn_accessor_type_e;

/** Deprecated */
typedef uint32_t vsi_nn_size_t;
@@ -32,8 +32,8 @@ extern "C"{
#endif

#define VSI_NN_VERSION_MAJOR 1
#define VSI_NN_VERSION_MINOR 1
#define VSI_NN_VERSION_PATCH 88
#define VSI_NN_VERSION_MINOR 2
#define VSI_NN_VERSION_PATCH 2
#define VSI_NN_VERSION \
    (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
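With the bumped macros, VSI_NN_VERSION evaluates to 1 * 10000 + 2 * 100 + 2 = 10202, so client code can gate on this release, e.g.:

    #if VSI_NN_VERSION >= 10202
    /* ovxlib rel/1.2.2 or newer. */
    #endif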
@@ -14,6 +14,10 @@ ifeq ($(PLATFORM_VENDOR),1)
LOCAL_VENDOR_MODULE := true
endif

$(info Remove $(LOCAL_PATH)/../include/vsi_nn_feature_config.h ...)
$(shell rm $(LOCAL_PATH)/../include/vsi_nn_feature_config.h -rf)
$(info $(shell bash $(LOCAL_PATH)/../gcc_gen_feature_config_header.sh $(LOCAL_PATH)/..))

LOCAL_SRC_FILES := \
    vsi_nn_context.c \
    vsi_nn_client_op.c \

@@ -59,12 +63,6 @@ LOCAL_SRC_FILES += \
    post/vsi_nn_post_fasterrcnn.c \
    post/vsi_nn_post_cmupose.c

LOCAL_SRC_FILES += \
    cpu_backend/vsi_nn_cpu_backend.c \
    cpu_backend/vsi_nn_cpu_backend_conv2d.c \
    cpu_backend/vsi_nn_cpu_backend_deconv2d.c \
    cpu_backend/npuref_interface.c


LOCAL_SRC_FILES += libnnext/vsi_nn_libnnext_resource.c \
    libnnext/vsi_nn_vxkernel.c

@@ -78,11 +76,10 @@ LOCAL_SRC_FILES += kernel/vsi_nn_kernel.c \
    kernel/vsi_nn_kernel_param.c \
    kernel/vsi_nn_kernel_gpu_shape_optimize.c \
    kernel/vsi_nn_kernel_lut.c \
    kernel/vsi_nn_spinst.c \
    kernel/vsi_nn_sp_unit_operation.c \
    kernel/vsi_nn_sp_lut.c \
    kernel/vsi_nn_gpu.c

LOCAL_SRC_FILES += vip/virtual_device.cpp

LIBNNEXT_KERNEL_SOURCES := $(wildcard $(LOCAL_PATH)/libnnext/ops/kernel/*.c)
LOCAL_SRC_FILES += $(LIBNNEXT_KERNEL_SOURCES:$(LOCAL_PATH)/%=%)

@@ -117,13 +114,14 @@ LOCAL_C_INCLUDES += \
    $(AQROOT)/sdk/inc/ \
    $(AQROOT)/sdk/inc/HAL \
    $(LOCAL_PATH)/../include \
    $(LOCAL_PATH)/../include/vip \
    $(LOCAL_PATH)/../include/ops \
    $(LOCAL_PATH)/../include/utils \
    $(LOCAL_PATH)/../include/infernce \
    $(LOCAL_PATH)/../include/client \
    $(LOCAL_PATH)/../include/cpu_backend \
    $(LOCAL_PATH)/../include/libnnext \
    $(LOCAL_PATH)/../src
    $(LOCAL_PATH)/../src \
    $(LOCAL_PATH)/../src/vip

LOCAL_CFLAGS := \
    -DLINUX \
@@ -22,6 +22,7 @@
*
*****************************************************************************/

#if !(VX_RELATIONAL_OPS_VX_SUPPORT_EXT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

@@ -293,6 +294,16 @@ static vsi_status _query_kernel
    input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
    output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE && input0_dtype == I16)
    {
        input0_dtype = I32;
    }

    if (inputs[1]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE && input1_dtype == I16)
    {
        input1_dtype = I32;
    }

    if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE && output_dtype == I8)
    {
        output_dtype = BOOL8;

@@ -452,3 +463,4 @@ final:
REGISTER_BACKEND_CL( relational_ops, _setup )

__END_DECLS
#endif
@@ -0,0 +1,359 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/


#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"

__BEGIN_DECLS

typedef enum _crop_and_resize_type_e
{
    nearest_neighbor = VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR,
    bilinear = VSI_NN_INTERPOLATION_BILINEAR,
} crop_and_resize_type_e;

#define _CROP_AND_RESIZE_KERNEL_SOURCE_NAME "crop_and_resize_"

// Add kernel hashtable here
#define CROP_AND_RESIZE_HASH_KEY( IN_DTYPE, OUT_DTYPE, RESIZE_METHOD ) \
    (( IN_DTYPE << 16 ) | ( OUT_DTYPE << 8) | (RESIZE_METHOD))
#define CROP_AND_RESIZE_KERNEL( IN_DTYPE, OUT_DTYPE, RESIZE_METHOD ) \
    { CROP_AND_RESIZE_HASH_KEY( IN_DTYPE, OUT_DTYPE, RESIZE_METHOD ), \
    CVIVANTE_NAMESPACE("cl.crop_and_resize_"#RESIZE_METHOD"_"#IN_DTYPE"to"#OUT_DTYPE), \
    _CROP_AND_RESIZE_KERNEL_SOURCE_NAME#RESIZE_METHOD }

typedef struct
{
    uint32_t key;
    char * function_name;
    const char * source_name;
} _kernel_map_type;

static const _kernel_map_type _crop_and_resize_kernel_map[] =
{
    // Register kernel here
    CROP_AND_RESIZE_KERNEL( U32, U32, nearest_neighbor ),
    CROP_AND_RESIZE_KERNEL( U32, F32, nearest_neighbor ),
    CROP_AND_RESIZE_KERNEL( F32, F32, nearest_neighbor ),
    CROP_AND_RESIZE_KERNEL( F32, U32, nearest_neighbor ),
    CROP_AND_RESIZE_KERNEL( F32, I32, nearest_neighbor ),
    CROP_AND_RESIZE_KERNEL( I32, I32, nearest_neighbor ),
    CROP_AND_RESIZE_KERNEL( I32, F32, nearest_neighbor ),

    CROP_AND_RESIZE_KERNEL( U32, U32, bilinear ),
    CROP_AND_RESIZE_KERNEL( U32, F32, bilinear ),
    CROP_AND_RESIZE_KERNEL( F32, F32, bilinear ),
    CROP_AND_RESIZE_KERNEL( F32, U32, bilinear ),
    CROP_AND_RESIZE_KERNEL( F32, I32, bilinear ),
    CROP_AND_RESIZE_KERNEL( I32, I32, bilinear ),
    CROP_AND_RESIZE_KERNEL( I32, F32, bilinear ),
};


/*
 * Kernel params
 */
static vx_param_description_t _crop_and_resize_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _CROP_AND_RESIZE_PARAM_NUM _cnt_of_array( _crop_and_resize_kernel_param_def )

/*
 * Kernel initializer
 */
DEF_KERNEL_INITIALIZER(_crop_and_resize_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VSI_FAILURE;
    gpu_param_t gpu_param = {
        3,
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0},
        };

    vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
    int32_t crop_width = 0;
    int32_t crop_height = 0;
    int32_t image_width = 0;
    int32_t image_height = 0;
    int32_t batch_out = 0;
    float width_scale = 0;
    float height_scale = 0;

    VSI_UNREFERENCED(param_size);

    attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
    attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );

    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &batch_out);
    CHECK_STATUS_FAIL_GOTO(status, final );

    image_width = (int32_t)(attr[0]->shape->data[0]);
    image_height = (int32_t)(attr[0]->shape->data[1]);
    crop_width = (int32_t)(attr[1]->shape->data[0]);
    crop_height = (int32_t)(attr[1]->shape->data[1]);

    width_scale = (crop_width > 1) ? (float)(image_width - 1) / (crop_width - 1) : 0;
    height_scale = (crop_height > 1) ? (float)(image_height - 1) / (crop_height - 1) : 0;

    gpu_param.global_scale[0] = 1;
    gpu_param.global_scale[1] = 1;
    gpu_param.global_scale[2] = 1;

    gpu_param.global_size[0] = (crop_width + gpu_param.global_scale[0] - 1)
        / gpu_param.global_scale[0];
    gpu_param.global_size[1] = (crop_height + gpu_param.global_scale[1] - 1)
        / gpu_param.global_scale[1];
    gpu_param.global_size[2] = (batch_out + gpu_param.global_scale[2] - 1)
        / gpu_param.global_scale[2];

    status = vsi_nn_kernel_gpu_config( node, &gpu_param );
    CHECK_STATUS_FAIL_GOTO(status, final);

    status = vsi_nn_kernel_gpu_add_param( node, "width_scale", &width_scale );
    status |= vsi_nn_kernel_gpu_add_param( node, "height_scale", &height_scale );
    status |= vsi_nn_kernel_gpu_add_param( node, "image_width", &image_width );
    status |= vsi_nn_kernel_gpu_add_param( node, "image_height", &image_height );
    CHECK_STATUS_FAIL_GOTO(status, final);

final:
    if (attr[0])
    {
        vsi_nn_kernel_tensor_attr_release( &attr[0] );
        attr[0] = NULL;
    }
    if (attr[1])
    {
        vsi_nn_kernel_tensor_attr_release( &attr[1] );
        attr[1] = NULL;
    }

    return status;
} /* _crop_and_resize_initializer() */


/*
 * Query kernel
 */
static vsi_status _query_kernel
    (
    vsi_nn_kernel_t * kernel,
    vsi_nn_tensor_t * const * const inputs,
    vsi_nn_tensor_t * const * const outputs,
    int32_t resize_method
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_dtype_e in_dtype;
    vsi_nn_kernel_dtype_e out_dtype;
    const _kernel_map_type * kernel_map = _crop_and_resize_kernel_map;
    size_t kernel_map_size = _cnt_of_array( _crop_and_resize_kernel_map );
    vx_param_description_t * param_def = _crop_and_resize_kernel_param_def;
    vx_kernel_initialize_f initializer = _crop_and_resize_initializer;

    uint32_t key;
    uint32_t i;

    in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    if (F16 == in_dtype)
    {
        in_dtype = F32;
    }
    else if (U8 == in_dtype)
    {
        in_dtype = U32;
    }
    else if (I8 == in_dtype || I16 == in_dtype)
    {
        in_dtype = I32;
    }

    if (F16 == out_dtype)
    {
        out_dtype = F32;
    }
    else if (U8 == out_dtype)
    {
        out_dtype = U32;
    }
    else if (I8 == out_dtype || I16 == out_dtype)
    {
        out_dtype = I32;
    }

    key = CROP_AND_RESIZE_HASH_KEY( in_dtype, out_dtype, resize_method );

    for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
    {
        if ( kernel_map[i].key == key )
        {
            break;
        }
    }
    if ( i < (uint32_t)kernel_map_size )
    {
        snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
        kernel->info.parameters = param_def;
        kernel->info.numParams = _cnt_of_array( _crop_and_resize_kernel_param_def );
        kernel->info.initialize = initializer;
        // Register code source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
                "eltwise_ops_helper",
                kernel_map[i].source_name );
        // Register binary source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
                kernel_map[i].source_name );
        status = VSI_SUCCESS;
    }
    return status;
} /* _query_kernel() */


static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t ** outputs,
    size_t output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t * kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_CROP_AND_RESIZE_PARAM_NUM];
    vsi_nn_kernel_node_t node = NULL;
    vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL;
    vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{0}};
    uint32_t ori_depth = (uint32_t)inputs[0]->attr.size[2];
    uint32_t ori_batchout = (uint32_t)outputs[0]->attr.size[3];
    float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
    float input_scale = vsi_nn_get_tensor_scale(inputs[0]);
    float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
    float output_scale = vsi_nn_get_tensor_scale(outputs[0]);
    float inOutScale = input_scale / output_scale;
    float inOutTile = output_zp - inOutScale * input_zp;

    float extrapolation_value = vsi_nn_kernel_param_get_float32( params, "extrapolation_value" );
    int32_t resize_method = vsi_nn_kernel_param_get_int32( params, "resize_method" );

    VSI_UNREFERENCED(params);
    VSI_UNREFERENCED(input_num);
    VSI_UNREFERENCED(output_num);

    if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
                outputs[0]->attr.dim_num ) )
    {
        return NULL;
    }

    shapes[0][0] = inputs[0]->attr.size[0];
    shapes[0][1] = inputs[0]->attr.size[1];
    shapes[0][2] = inputs[0]->attr.size[2] * inputs[0]->attr.size[3];

    shapes[1][0] = outputs[0]->attr.size[0];
    shapes[1][1] = outputs[0]->attr.size[1];
    shapes[1][2] = outputs[0]->attr.size[2] * outputs[0]->attr.size[3];

    rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[0], 3 );
    rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[1], 3 );

    if (rs_input == NULL || rs_output == NULL)
    {
        goto final;
    }

    status = _query_kernel( kernel, inputs, outputs, resize_method );
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if ( node )
        {
            /* Set inputs and outputs */
            node_params[0] = rs_input;
            node_params[1] = (vsi_nn_kernel_node_param_t)(inputs[1]->t);
            node_params[2] = (vsi_nn_kernel_node_param_t)(inputs[2]->t);
            node_params[3] = rs_output;
            node_params[4] = vsi_nn_kernel_scalar_create( graph, I32, &ori_depth );
            node_params[5] = vsi_nn_kernel_scalar_create( graph, I32, &ori_batchout );
            node_params[6] = vsi_nn_kernel_scalar_create( graph, F32, &inOutScale );
            node_params[7] = vsi_nn_kernel_scalar_create( graph, F32, &inOutTile );
            node_params[8] = vsi_nn_kernel_scalar_create( graph, F32, &extrapolation_value );
            status = vsi_nn_kernel_node_pass_param( node, node_params, _CROP_AND_RESIZE_PARAM_NUM );
            CHECK_STATUS(status);
            vsi_nn_kernel_scalar_release( &node_params[4] );
            vsi_nn_kernel_scalar_release( &node_params[5] );
            vsi_nn_kernel_scalar_release( &node_params[6] );
            vsi_nn_kernel_scalar_release( &node_params[7] );
            vsi_nn_kernel_scalar_release( &node_params[8] );
        }
    }
final:
    if (rs_input)
    {
        vsi_nn_kernel_tensor_release( &rs_input );
    }
    if (rs_output)
    {
        vsi_nn_kernel_tensor_release( &rs_output );
    }
    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_CL( crop_and_resize, _setup )
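A worked example of the hash key above (not part of the diff): the three selectors are packed into disjoint bit fields, so with VSI_NN_INTERPOLATION_BILINEAR == 1 per the interpolation enum earlier in this commit,

    uint32_t key = CROP_AND_RESIZE_HASH_KEY( F32, F32, bilinear );
    /* == (F32 << 16) | (F32 << 8) | 1, which selects
     * cl.crop_and_resize_bilinear_F32toF32 in _crop_and_resize_kernel_map. */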
@@ -22,7 +22,7 @@
*
*****************************************************************************/


#if !(VX_DEPTH2SPACE_CRD_MODE_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

@@ -228,4 +228,4 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_CL( depth2space_internal, _setup )

#endif
@@ -1,300 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/


#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"

__BEGIN_DECLS

/*
 * Define kernel meta.
 */
typedef enum
{
    INTERNAL_KERNEL_DETECT_POST_BOX,
} _internal_kernel_e;

#define _DETECT_POST_BOX_KERNEL_SOURCE "detect_post_box"

#define STR(a) #a
// Add kernel hashtable here
#define DETECT_POST_BOX_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
    ((IN0_DTYPE << 18) | ( IN1_DTYPE << 11 ) | ( OUT_DTYPE << 4))

#define PACK_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
    { DETECT_POST_BOX_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE), \
    CVIVANTE_NAMESPACE("cl.detect_post_box_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \
    _DETECT_POST_BOX_KERNEL_SOURCE}

typedef struct
{
    uint32_t key;
    char * function_name;
    const char * source_name;
} _kernel_map_type;

static const _kernel_map_type _detect_post_box_kernel_map[] =
{
    // Register kernel here
    PACK_KERNEL_MAP( F32, F32, F32 ),
    PACK_KERNEL_MAP( U8, U8, F32 ),
};


/*
 * Kernel params
 */
static vx_param_description_t _detect_post_box_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _DETECT_POST_BOX_PARAM_NUM _cnt_of_array( _detect_post_box_kernel_param_def )

#define _DETECT_POST_BOX_F32_PARAM_NUM 8

#define SCALAR_SCALE_Y (3)
#define SCALAR_SCALE_X (4)
#define SCALAR_SCALE_H (5)
#define SCALAR_SCALE_W (6)
#define SCALAR_LOG_E (7)
#define SCALAR_TAIL0 (8)
#define SCALAR_TAIL1 (9)
#define SCALAR_SCALE0 (10)
#define SCALAR_SCALE1 (11)

/*
 * Kernel initializer
 */
DEF_KERNEL_INITIALIZER(_detect_post_box_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VSI_FAILURE;
    gpu_param_t gpu_param = {
        3,
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0}
        };
    vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
    vsi_size_array_t * in_shape = NULL;

    VSI_UNREFERENCED(param_size);
    VSI_UNREFERENCED(node);

    input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
    CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
    in_shape = input_attr->shape;

    gpu_param.global_scale[0] = 1;
    gpu_param.global_scale[1] = 1;
    gpu_param.global_scale[2] = 1;

    gpu_param.dim = 2;
    gpu_param.global_size[0] = (
        (in_shape->data[1] + gpu_param.global_scale[0] - 1)
        / gpu_param.global_scale[0]);
    gpu_param.global_size[1] = (
        (in_shape->data[2] + gpu_param.global_scale[1] - 1)
        / gpu_param.global_scale[1]);
    gpu_param.global_size[2] = 1;
    status = vsi_nn_kernel_gpu_config( node, &gpu_param );

final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
    SAFE_FREE_TENSOR_ATTR(input_attr);

    return status;
} /* _detect_post_box_initializer() */


/*
 * Query kernel
 */
static vsi_status _query_kernel
    (
    vsi_nn_kernel_t * kernel,
    vsi_nn_tensor_t * const * const inputs,
    vsi_nn_tensor_t * const * const outputs,
    vsi_bool *is_use_u8_kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_dtype_e in0_dtype;
    vsi_nn_kernel_dtype_e in1_dtype;
    vsi_nn_kernel_dtype_e out_dtype;
    const _kernel_map_type * kernel_map = _detect_post_box_kernel_map;
    size_t kernel_map_size = _cnt_of_array( _detect_post_box_kernel_map );
    vx_param_description_t * param_def = _detect_post_box_kernel_param_def;
    size_t param_def_size = _cnt_of_array( _detect_post_box_kernel_param_def );
    vx_kernel_initialize_f initializer = _detect_post_box_initializer;
    uint32_t key;
    uint32_t i;

    in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );


    if ((U8 == in0_dtype) && (U8 == in1_dtype))
    {
        *is_use_u8_kernel = TRUE;
        param_def_size = _DETECT_POST_BOX_PARAM_NUM;
    }
    else
    {
        *is_use_u8_kernel = FALSE;
        param_def_size = _DETECT_POST_BOX_F32_PARAM_NUM;
    }

    key = DETECT_POST_BOX_HASH_KEY( in0_dtype, in1_dtype, out_dtype );

    for ( i = 0; i < kernel_map_size; i ++ )
    {
        if ( kernel_map[i].key == key )
        {
            break;
        }
    }
    if ( i < kernel_map_size )
    {
        snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
        kernel->info.parameters = param_def;
        kernel->info.numParams = (vx_uint32)param_def_size;
        kernel->info.initialize = initializer;
        // Register code source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
                kernel_map[i].source_name );
        // Register binary source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
                kernel_map[i].source_name );
        status = VSI_SUCCESS;
    }
    return status;
} /* _query_kernel() */


static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t ** outputs,
    size_t output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t * kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_DETECT_POST_BOX_PARAM_NUM] = {NULL};
    vsi_nn_kernel_node_t node = NULL;
    float logE = (float)(log10(exp(1.0f)) / log10(2.0f));
    float inv_scale_y = vsi_nn_kernel_param_get_float32( params, "inv_scale_y" );
    float inv_scale_x = vsi_nn_kernel_param_get_float32( params, "inv_scale_x" );
    float inv_scale_h = vsi_nn_kernel_param_get_float32( params, "inv_scale_h" );
    float inv_scale_w = vsi_nn_kernel_param_get_float32( params, "inv_scale_w" );
    vsi_bool is_use_u8_kernel = FALSE;
    float input0Scale = vsi_nn_get_tensor_scale(inputs[0]);
    float input0Zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
    float input0Tail = -input0Zp * input0Scale;
    float input1Scale = vsi_nn_get_tensor_scale(inputs[1]);
    float input1Zp = (float)vsi_nn_get_tensor_zero_point(inputs[1]);
    float input1Tail = -input1Zp * input1Scale;

    status = _query_kernel( kernel, inputs, outputs, &is_use_u8_kernel );

    if ( VSI_SUCCESS == status )
    {
        size_t node_params_num = _DETECT_POST_BOX_F32_PARAM_NUM;

        node = vsi_nn_kernel_create_node( graph, kernel );
        if ( node )
        {
            /* Set inputs and outputs */
            vsi_nn_kernel_node_pack_io( node_params, _DETECT_POST_BOX_PARAM_NUM,
                    inputs, input_num, outputs, output_num );
            node_params[SCALAR_SCALE_Y] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_y );
            node_params[SCALAR_SCALE_X] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_x );
            node_params[SCALAR_SCALE_H] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_h );
            node_params[SCALAR_SCALE_W] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_w );
            node_params[SCALAR_LOG_E] = vsi_nn_kernel_scalar_create( graph, F32, &logE );
            if (is_use_u8_kernel)
            {
                node_params[SCALAR_TAIL0] = vsi_nn_kernel_scalar_create( graph, F32, &input0Tail );
                node_params[SCALAR_TAIL1] = vsi_nn_kernel_scalar_create( graph, F32, &input1Tail );
                node_params[SCALAR_SCALE0] = vsi_nn_kernel_scalar_create( graph, F32, &input0Scale );
                node_params[SCALAR_SCALE1] = vsi_nn_kernel_scalar_create( graph, F32, &input1Scale );
                node_params_num = _DETECT_POST_BOX_PARAM_NUM;
            }

            /* Pass parameters to node. */
            status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
            VSI_ASSERT( status == VSI_SUCCESS );
            vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_Y] );
            vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_X] );
            vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_H] );
            vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_W] );
            vsi_nn_kernel_scalar_release( &node_params[SCALAR_LOG_E] );
            if (is_use_u8_kernel)
            {
                vsi_nn_kernel_scalar_release( &node_params[SCALAR_TAIL0] );
                vsi_nn_kernel_scalar_release( &node_params[SCALAR_TAIL1] );
                vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE0] );
                vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE1] );
            }
        }
    }
    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_CL( detect_post_box, _setup )
@@ -1,197 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/


#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"

__BEGIN_DECLS
#if 0
/*
 * Define kernel meta.
 */
typedef enum
{
    INTERNAL_KERNEL_DETECT_POST_NMS,
} _internal_kernel_e;

#define _DETECT_POST_NMS_KERNEL_SOURCE "detect_post_nms"
#define _DETECT_POST_NMS_KERNEL_NAME CVIVANTE_NAMESPACE("cl.detect_post_nms")

// Add kernel hashtable here
#define DETECT_POST_NMS_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
    (( IN_DTYPE << 8 ) | ( OUT_DTYPE ))
#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, SOURCE ) \
    { DETECT_POST_NMS_HASH_KEY( IN_DTYPE, OUT_DTYPE ), _DETECT_POST_NMS_KERNEL_NAME, SOURCE }

typedef struct
{
    uint32_t key;
    char * function_name;
    const char * source_name;
} _kernel_map_type;

static const _kernel_map_type _detect_post_nms_kernel_map[] =
{
    // Register kernel here
    PACK_KERNEL_MAP( F32, F32, _DETECT_POST_NMS_KERNEL_SOURCE ),
};


/*
 * Kernel params
 */
static vx_param_description_t _detect_post_nms_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _DETECT_POST_NMS_PARAM_NUM _cnt_of_array( _detect_post_nms_kernel_param_def )

#define SCALAR_NMS_TYPE (6)
#define SCALAR_MAX_NUM (7)
#define SCALAR_MAX_CLASS (8)
#define SCALAR_MAX_DETECT (9)
#define SCALAR_SCORE_TH (10)
#define SCALAR_IOU_TH (11)
#define SCALAR_IS_BG (12)

/*
 * Kernel initializer
 */
DEF_KERNEL_INITIALIZER(_detect_post_nms_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VSI_FAILURE;

    return status;
} /* _detect_post_nms_initializer() */


/*
 * Query kernel
 */

static vsi_status _query_kernel
    (
    vsi_nn_kernel_t * kernel,
    vsi_nn_tensor_t * const * const inputs,
    vsi_nn_tensor_t * const * const outputs
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_dtype_e in_dtype;
    vsi_nn_kernel_dtype_e out_dtype;
    const _kernel_map_type * kernel_map = _detect_post_nms_kernel_map;
    size_t kernel_map_size = _cnt_of_array( _detect_post_nms_kernel_map );
    vx_param_description_t * param_def = _detect_post_nms_kernel_param_def;
    size_t param_def_size = _cnt_of_array( _detect_post_nms_kernel_param_def );
    vx_kernel_initialize_f initializer = _detect_post_nms_initializer;

    uint32_t key;
    uint32_t i;

    in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    key = DETECT_POST_NMS_HASH_KEY( in_dtype, out_dtype );

    for ( i = 0; i < kernel_map_size; i++ )
    {
        if ( kernel_map[i].key == key )
        {
            break;
        }
    }
    if ( i < kernel_map_size )
    {
        snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
        kernel->info.parameters = param_def;
        kernel->info.numParams = param_def_size;
        kernel->info.initialize = initializer;
        // Register code source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
                kernel_map[i].source_name );
        // Register binary source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
                kernel_map[i].source_name );
        status = VSI_SUCCESS;
    }
    return status;
} /* _query_kernel() */
#endif

static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t ** outputs,
    size_t output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t * kernel
    )
{
    vsi_nn_kernel_node_t node = NULL;

    VSI_UNREFERENCED(graph);
    VSI_UNREFERENCED(inputs);
    VSI_UNREFERENCED(input_num);
    VSI_UNREFERENCED(outputs);
    VSI_UNREFERENCED(output_num);
    VSI_UNREFERENCED(params);
    VSI_UNREFERENCED(kernel);

    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_CL( detect_post_nms, _setup )
@@ -60,6 +60,7 @@ typedef enum
    UNARY_ATANH,
    UNARY_ACOSH,
    UNARY_INVERSE_SIGMOID,
    UNARY_TAN,
} unary_type_e;

/*

@@ -108,6 +109,7 @@ typedef enum
#define ATANH_OPERATION atanh
#define ACOSH_OPERATION acosh
#define INVERSE_SIGMOID_OPERATION inverse_sigmoid
#define TAN_OPERATION tan

#define ADD_UNARY_SH_KERNELS(name) \
    TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, F32, F32) \

@@ -142,6 +144,7 @@ static const struct {
    ADD_UNARY_SH_KERNELS(ATANH)
    ADD_UNARY_SH_KERNELS(ACOSH)
    ADD_UNARY_SH_KERNELS(INVERSE_SIGMOID)
    ADD_UNARY_SH_KERNELS(TAN)

    TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I32, I32)
    TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I32, I32)

@@ -166,6 +169,7 @@ static const struct {
#undef ATANH_OPERATION
#undef ACOSH_OPERATION
#undef INVERSE_SIGMOID_OPERATION
#undef TAN_OPERATION
/*
 * Kernel params
 */

@@ -452,16 +456,22 @@ OnError:
REGISTER_BACKEND_CL( KERNEL_NAME, _##KERNEL_NAME##_setup )


#if !(VX_ACTIVATION_SIN_COS_VX_SUPPORT_EXT)
REGISTER_ELTWISE_UNARY_BACKEND_CL( sin, UNARY_SIN )
REGISTER_ELTWISE_UNARY_BACKEND_CL( cos, UNARY_COS )
#endif
#if !(VX_ACTIVATION_EXP_VX_SUPPORT_EXT)
REGISTER_ELTWISE_UNARY_BACKEND_CL( exp, UNARY_EXP )
#endif
REGISTER_ELTWISE_UNARY_BACKEND_CL( log, UNARY_LOG )
REGISTER_ELTWISE_UNARY_BACKEND_CL( neg, UNARY_NEG )
REGISTER_ELTWISE_UNARY_BACKEND_CL( hard_sigmoid, UNARY_HSIGMOID )
REGISTER_ELTWISE_UNARY_BACKEND_CL( mish, UNARY_MISH )
REGISTER_ELTWISE_UNARY_BACKEND_CL( round, UNARY_ROUND )
#if !(VX_ACTIVATION_GELU_VX_SUPPORT_EXT)
REGISTER_ELTWISE_UNARY_BACKEND_CL( gelu, UNARY_GELU )
REGISTER_ELTWISE_UNARY_BACKEND_CL( hard_gelu, UNARY_HGELU )
#endif
REGISTER_ELTWISE_UNARY_BACKEND_CL( selu, UNARY_SELU )
REGISTER_ELTWISE_UNARY_BACKEND_CL( celu, UNARY_CELU )
REGISTER_ELTWISE_UNARY_BACKEND_CL( rcp, UNARY_RCP )

@@ -471,5 +481,6 @@ REGISTER_ELTWISE_UNARY_BACKEND_CL( atan, UNARY_ATAN )
REGISTER_ELTWISE_UNARY_BACKEND_CL( atanh, UNARY_ATANH )
REGISTER_ELTWISE_UNARY_BACKEND_CL( acosh, UNARY_ACOSH )
REGISTER_ELTWISE_UNARY_BACKEND_CL( inverse_sigmoid, UNARY_INVERSE_SIGMOID )
REGISTER_ELTWISE_UNARY_BACKEND_CL( tan, UNARY_TAN )

__END_DECLS
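As a reading aid (not part of the diff): with #define TAN_OPERATION tan in effect, the new ADD_UNARY_SH_KERNELS(TAN) table entry expands, per the macro shown above, to entries starting with

    TENSOR_UNARY_KERNELS_3D( tan, UNARY_TAN, F32, F32 )
    /* ...plus whatever further dtype combinations the (truncated) macro body adds. */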
@@ -22,7 +22,7 @@
*
*****************************************************************************/


#if !(VX_TENSOR_GATHER_API_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

@@ -420,3 +420,4 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_CL( gather, _setup )
#endif
@ -90,6 +90,8 @@ static vx_param_description_t _grucell_activation_z_h_kernel_param_def[] =
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};
#define _GRUCELL_ACTIVATION_Z_H_PARAM_NUM _cnt_of_array( _grucell_activation_z_h_kernel_param_def )

@ -97,6 +99,8 @@ static vx_param_description_t _grucell_activation_z_h_kernel_param_def[] =
#define SCALAR_INPUT_TAIL (8)
#define SCALAR_OUTPUT_SCALE (9)
#define SCALAR_OUTPUT_ZP (10)
#define SCALAR_OUTPUT1_SCALE (11)
#define SCALAR_OUTPUT1_ZP (12)
/*
 * Kernel initializer
 */

@ -244,6 +248,8 @@ static vsi_nn_kernel_node_t _setup
    float input_tail = -(float)vsi_nn_get_tensor_zero_point(inputs[GRUCELL_ACT_Z_H_HSTATE]) * input_scale;
    float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[GRUCELL_ACT_Z_H_OUT_OUTPUT]);
    float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[GRUCELL_ACT_Z_H_OUT_OUTPUT]);
    float output_scale1 = 1.0f / vsi_nn_get_tensor_scale(outputs[GRUCELL_ACT_Z_H_OUT_HSTATE]);
    float output_zp1 = (float)vsi_nn_get_tensor_zero_point(outputs[GRUCELL_ACT_Z_H_OUT_HSTATE]);

    if( activation != VSI_NN_ACT_TANH )
    {

@ -268,11 +274,17 @@ static vsi_nn_kernel_node_t _setup
            graph, F32, &output_scale );
        node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create(
            graph, F32, &output_zp );
        node_params[SCALAR_OUTPUT1_SCALE] = vsi_nn_kernel_scalar_create(
            graph, F32, &output_scale1 );
        node_params[SCALAR_OUTPUT1_ZP] = vsi_nn_kernel_scalar_create(
            graph, F32, &output_zp1 );
        status = vsi_nn_kernel_node_pass_param( node, node_params, _GRUCELL_ACTIVATION_Z_H_PARAM_NUM );
        vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] );
        vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] );
        vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] );
        vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] );
        vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT1_SCALE] );
        vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT1_ZP] );
        }
    }
    return node;
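Editor's note: the `*_scale` / `*_tail` / `*_zp` scalars added above fold the affine quantization mapping real = scale * (q - zp) into a single multiply-add before it is handed to the shader. A minimal standalone sketch of that arithmetic (illustration only, not ovxlib API; the clamp range assumes u8):

```c
#include <math.h>
#include <stdint.h>

/* input_tail is precomputed as -zp * input_scale, so dequantization
 * becomes q * scale + tail, one fma per element. */
static float dequant(uint8_t q, float input_scale, float input_tail)
{
    return (float)q * input_scale + input_tail;
}

/* output_scale is precomputed as 1 / scale, so requantization becomes
 * round(real / scale) + zp, clamped to the u8 range. */
static uint8_t requant(float real, float output_scale, float output_zp)
{
    float q = real * output_scale + output_zp;
    q = q < 0.0f ? 0.0f : (q > 255.0f ? 255.0f : q);
    return (uint8_t)lroundf(q);
}
```

Precomputing the reciprocal and the folded tail on the host is what lets the kernel avoid per-element division and zero-point subtraction.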
@ -46,6 +46,7 @@ typedef enum _grucell_nn_activation_type_e
{
    SIGMOID = VSI_NN_ACT_SIGMOID,
    HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID,
    RELU = VSI_NN_ACT_RELU,
}grucell_nn_activation_type_e;

#define _GRUCELL_RESET_AFTER_ACTIVATION_KERNEL_SOURCE "grucell_reset_after_activation"

@ -71,6 +72,9 @@ static const _kernel_map_type _grucell_reset_after_activation_kernel_map[] =
    PACK_KERNEL_MAP( U8, F32, U8, SIGMOID ),
    PACK_KERNEL_MAP( I32, F32, I32, SIGMOID ),
    PACK_KERNEL_MAP( F32, F32, F32, SIGMOID ),
    PACK_KERNEL_MAP( U8, F32, U8, RELU ),
    PACK_KERNEL_MAP( I32, F32, I32, RELU ),
    PACK_KERNEL_MAP( F32, F32, F32, RELU ),
};

@ -22,7 +22,7 @@
 *
 *****************************************************************************/


#if !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

@ -360,3 +360,4 @@ final:
__END_DECLS

REGISTER_BACKEND_CL( layer_norm, _setup )
#endif
@ -22,6 +22,7 @@
 *
 *****************************************************************************/

#if !(VX_LOGSOFTMAX_VX_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

@ -34,6 +35,7 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"

__BEGIN_DECLS


@ -41,27 +43,30 @@ __BEGIN_DECLS
/*
 * Define kernel meta.
 */
#define HASH_LOG_SOFTMAX_KEY(_axis, _input_type, _output_type, _image_2d) \
    ((_axis << 20) | (_input_type << 12) | (_output_type << 4) | (_image_2d))
#define HASH_LOG_SOFTMAX_KEY(_axis, _input_type, _output_type, _image_2d, exceed_limit) \
    ((_axis << 24) | (_input_type << 16) | (_output_type << 8) | (_image_2d << 4) | exceed_limit)

#define VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(_axis) \
    "log_softmax_axis"#_axis

#define VSI_NN_GEN_LOG_SOFTMAX_EXCEED_KERNEL_SOURCE_NAME(_axis) \
    "log_softmax_exceed_axis"#_axis

#define HASH_LOG_SOFTMAX_SH_KERNEL_NAME(AXIS, SRC0_TYPE, DST_TYPE) \
    CVIVANTE_NAMESPACE("cl.log_softmax_axis"#AXIS"_"#SRC0_TYPE"to"#DST_TYPE)

#define TENSOR_LOG_SOFTMAX_KERNELS(AXIS, SRC0_TYPE, OUT_TYPE) \
    { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0), \
    { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0, 0), \
    HASH_LOG_SOFTMAX_SH_KERNEL_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \
    VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) },

#define TENSOR_LOG_SOFTMAX_FLOAT(AXIS, SRC0_TYPE, OUT_TYPE) \
    { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0), \
    { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0, 0), \
    HASH_LOG_SOFTMAX_SH_KERNEL_NAME(AXIS, F32, F32), \
    VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) },

#define TENSOR_LOG_SOFTMAX_BFLOAT(AXIS, SRC0_TYPE, OUT_TYPE) \
    { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0), \
    { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0, 0), \
    HASH_LOG_SOFTMAX_SH_KERNEL_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \
    VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) },

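Editor's note: the key macro is widened here from 4-bit to 8-bit dtype fields, with the new `exceed_limit` flag taking the low bits, so the lookup key cannot collide once dtype enum values grow past 15. A standalone sketch of the packing (mirrors the new macro above; field widths are the commit's, the function name is hypothetical):

```c
#include <stdint.h>

/* axis:8 | input dtype:8 | output dtype:8 | image_2d:4 | exceed_limit:4 */
static uint32_t make_log_softmax_key(uint32_t axis, uint32_t in_t,
                                     uint32_t out_t, uint32_t image_2d,
                                     uint32_t exceed_limit)
{
    return (axis << 24) | (in_t << 16) | (out_t << 8)
         | (image_2d << 4) | exceed_limit;
}
```

Because every table entry below is generated through the same macro, widening the layout in one place re-keys the whole kernel map consistently.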
@ -69,20 +74,28 @@ __BEGIN_DECLS
    CVIVANTE_NAMESPACE("cl.log_softmax_axis"#AXIS"_"#SRC0_TYPE"to"#DST_TYPE"_2D")

#define TENSOR_LOG_SOFTMAX_KERNELS_2D(AXIS, SRC0_TYPE, OUT_TYPE) \
    { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1), \
    { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1, 0), \
    HASH_LOG_SOFTMAX_SH_KERNEL_2D_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \
    VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) },

#define TENSOR_LOG_SOFTMAX_FLOAT_2D(AXIS, SRC0_TYPE, OUT_TYPE) \
    { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1), \
    { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1, 0), \
    HASH_LOG_SOFTMAX_SH_KERNEL_2D_NAME(AXIS, F32, F32), \
    VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) },

#define TENSOR_LOG_SOFTMAX_BFLOAT_2D(AXIS, SRC0_TYPE, OUT_TYPE) \
    { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1), \
    { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1, 0), \
    HASH_LOG_SOFTMAX_SH_KERNEL_2D_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \
    VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) },

#define HASH_LOG_SOFTMAX_EXCEED_SH_KERNEL_NAME(AXIS, SRC0_TYPE, DST_TYPE) \
    CVIVANTE_NAMESPACE("cl.log_softmax_exceed_axis"#AXIS"_"#SRC0_TYPE"to"#DST_TYPE)

#define TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(AXIS, SRC0_TYPE, OUT_TYPE) \
    { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0, 1), \
    HASH_LOG_SOFTMAX_EXCEED_SH_KERNEL_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \
    VSI_NN_GEN_LOG_SOFTMAX_EXCEED_KERNEL_SOURCE_NAME(AXIS) },

static const struct {
    uint32_t key;
    char* function_name;

@ -92,31 +105,31 @@ static const struct {
    TENSOR_LOG_SOFTMAX_FLOAT(0, F32, F32)
    TENSOR_LOG_SOFTMAX_FLOAT(1, F32, F32)
    TENSOR_LOG_SOFTMAX_FLOAT(2, F32, F32)
    TENSOR_LOG_SOFTMAX_FLOAT(0, F16, F16)
    TENSOR_LOG_SOFTMAX_FLOAT(1, F16, F16)
    TENSOR_LOG_SOFTMAX_FLOAT(2, F16, F16)
    TENSOR_LOG_SOFTMAX_BFLOAT(0, BF16, BF16)
    TENSOR_LOG_SOFTMAX_BFLOAT(1, BF16, BF16)
    TENSOR_LOG_SOFTMAX_BFLOAT(2, BF16, BF16)

    TENSOR_LOG_SOFTMAX_FLOAT_2D(0, F32, F32)
    TENSOR_LOG_SOFTMAX_FLOAT_2D(1, F32, F32)
    TENSOR_LOG_SOFTMAX_FLOAT_2D(0, F16, F16)
    TENSOR_LOG_SOFTMAX_FLOAT_2D(1, F16, F16)
    TENSOR_LOG_SOFTMAX_BFLOAT_2D(0, BF16, BF16)
    TENSOR_LOG_SOFTMAX_BFLOAT_2D(1, BF16, BF16)

    TENSOR_LOG_SOFTMAX_KERNELS(0, U8, U8)
    TENSOR_LOG_SOFTMAX_KERNELS(1, U8, U8)
    TENSOR_LOG_SOFTMAX_KERNELS(2, U8, U8)

    TENSOR_LOG_SOFTMAX_KERNELS_2D(0, U8, U8)
    TENSOR_LOG_SOFTMAX_KERNELS_2D(1, U8, U8)

    TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(0, U8, U8)
    TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(1, U8, U8)

    TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(0, F32, F32)
    TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(1, F32, F32)

    TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(0, BF16, BF16)
    TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(1, BF16, BF16)

};

/*
@ -198,12 +211,89 @@ final:
    return status;
} /* _log_softmax_initializer() */

DEF_KERNEL_INITIALIZER(_log_softmax_exceed_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    gpu_param_t gpu_param = {
        2,          // workdim
        {0, 0, 0},  // globalWorkOffset: controls the start location to be processed in the image
        {0, 0, 0},  // globalWorkScale: how many pixels a single thread processes
        {0, 0, 0},  // localWorkSize: local work-group size, in threads
        {0, 0, 0}   // globalWorkSize: total work size, in threads
        };

    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
    vsi_size_array_t * out_shape = NULL;
    int32_t axis = 0;
    int32_t width = 0;
    int32_t height = 0;
    int32_t depth = 0;

    VSI_UNREFERENCED(param_size);

    attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
    attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );

    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis);
    CHECK_STATUS_FAIL_GOTO(status, final );

    out_shape = attr[1]->shape;

    width = (int32_t)(out_shape->data[0]);
    height = (int32_t)(out_shape->data[1]);
    depth = attr[1]->shape->size > 2 ? (int32_t)(out_shape->data[2]) : 1;
    gpu_param.global_scale[0] = 1;
    gpu_param.global_scale[1] = 1;
    if (axis == 0)
    {
        gpu_param.global_size[0] = 1;
        gpu_param.global_size[1] = depth;
    }
    else
    {
        gpu_param.global_size[0] = width;
        gpu_param.global_size[1] = 1;
    }

    status = vsi_nn_kernel_gpu_config( node, &gpu_param );
    if (axis == 0)
    {
        status |= vsi_nn_kernel_gpu_add_param( node, "width", &width );
    }
    else
    {
        status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth );
    }
    status |= vsi_nn_kernel_gpu_add_param( node, "height", &height );

final:
    if (attr[0])
    {
        vsi_nn_kernel_tensor_attr_release( &attr[0] );
    }

    if (attr[1])
    {
        vsi_nn_kernel_tensor_attr_release( &attr[1] );
    }

    return status;
}

static vsi_status _query_kernel
    (
    vsi_nn_tensor_t* const* const inputs,
    vsi_nn_tensor_t* const* const outputs,
    int32_t axis,
    vsi_bool image_2d,
    vsi_bool exceed_limit,
    vsi_nn_kernel_t* kernel
    )
{

@ -215,7 +305,17 @@ static vsi_status _query_kernel

    input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
    key = HASH_LOG_SOFTMAX_KEY( axis, input_dtype, output_dtype, image_2d );

    if (input_dtype == F16)
    {
        input_dtype = F32;
    }
    if (output_dtype == F16)
    {
        output_dtype = F32;
    }
    if (exceed_limit) image_2d = vx_false_e;
    key = HASH_LOG_SOFTMAX_KEY( axis, input_dtype, output_dtype, image_2d, exceed_limit );

    for( i = 0; i < _cnt_of_array(kernel_map); i ++ )
    {

@ -229,7 +329,14 @@ static vsi_status _query_kernel
        snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
        kernel->info.parameters = kernel_param_def;
        kernel->info.numParams = _cnt_of_array( kernel_param_def );
        kernel->info.initialize = _log_softmax_initializer;
        if (exceed_limit)
        {
            kernel->info.initialize = _log_softmax_exceed_initializer;
        }
        else
        {
            kernel->info.initialize = _log_softmax_initializer;
        }
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
            kernel_map[i].source_name );
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,

@ -254,7 +361,14 @@ static vsi_nn_kernel_node_t _setup
    vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL};
    vsi_bool image_2d = FALSE;
    vsi_nn_kernel_node_t node = NULL;
    vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
    vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } };
    uint32_t rank_in = 0;
    int32_t axis = 0;
    int32_t new_axis = 0;
    vsi_bool ret = vx_false_e;
    vsi_bool exceed_limit = vx_false_e;
    uint32_t i = 0;
    float beta = 0;
    float inputScale = vsi_nn_get_tensor_scale(inputs[0]);
    float outputScale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]);

@ -270,16 +384,37 @@ static vsi_nn_kernel_node_t _setup
    scaleValue = scaleValue * beta * inputScale;
    beta = beta * inputScale;

    if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size,
        inputs[0]->attr.dim_num )
        || axis > 2)
    if (inputs[0]->attr.size[axis] >= GPU_TENSOR_MAX_WIDTH)
    {
        exceed_limit = vx_true_e;
    }

    ret = vsi_nn_kernel_optimize_softmax_shape(
        inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
        shapes[0], &rank_in, &new_axis);

    if (ret)
    {
        reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
            inputs[0], shapes[0], rank_in );
        reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
            outputs[0], shapes[0], rank_in );
    }
    else
    {
        return NULL;
    }

    image_2d = ((inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1)
        && axis != 2);
    status = _query_kernel( inputs, outputs, axis, image_2d, kernel );
    if( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[0]->attr.size,
        reshape_tensors[0]->attr.dim_num )
        || new_axis > 2 || (new_axis == 2 && exceed_limit))
    {
        return NULL;
    }

    image_2d = ((reshape_tensors[0]->attr.dim_num == 2 || reshape_tensors[0]->attr.size[2] == 1)
        && new_axis != 2);
    status = _query_kernel( inputs, outputs, new_axis, image_2d, exceed_limit, kernel );
    if( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );

@ -287,10 +422,10 @@ static vsi_nn_kernel_node_t _setup
    if( node )
    {
        vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM,
            inputs, 1, outputs, 1 );
            reshape_tensors, 1, &reshape_tensors[1], 1 );

        node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create(
            graph, I32, &axis );
            graph, I32, &new_axis );
        node_params[SCALAR_INPUT_BETA] = vsi_nn_kernel_scalar_create(
            graph, F32, &beta );
        node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create(

@ -311,9 +446,16 @@ static vsi_nn_kernel_node_t _setup
        vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] );
        }
    }

    for (i = 0; i < 2; i++)
    {
        vsi_safe_release_tensor(reshape_tensors[i]);
    }

    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_CL( log_softmax, _setup )
#endif
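Editor's note: the control flow added above swaps a hard failure for a fallback. Previously an axis longer than the GPU image limit made `_setup` return NULL; now it sets `exceed_limit`, disables the 2D-image fast path, and selects the buffer-based "exceed" kernel variant. A hypothetical condensation of that dispatch rule (standalone C, with a stand-in constant for GPU_TENSOR_MAX_WIDTH):

```c
#include <stdint.h>

#define MAX_IMAGE_WIDTH 65536u /* stand-in for GPU_TENSOR_MAX_WIDTH */

typedef struct { int exceed_limit; int image_2d; } dispatch_t;

/* Once the softmax axis no longer fits a GPU image row, fall back to
 * the "exceed" kernels and drop the 2D-image path entirely. */
static dispatch_t choose_log_softmax_path(uint32_t axis_len, uint32_t rank,
                                          uint32_t depth, int new_axis)
{
    dispatch_t d;
    d.exceed_limit = (axis_len >= MAX_IMAGE_WIDTH);
    d.image_2d = !d.exceed_limit && (rank == 2 || depth == 1) && new_axis != 2;
    return d;
}
```

Note that `new_axis == 2 && exceed_limit` still returns NULL above: only axes 0 and 1 have exceed-kernel table entries.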
@ -75,6 +75,9 @@ __BEGIN_DECLS
#define HASH_MATRIXMUL_4X_TRANSA_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \
    CVIVANTE_NAMESPACE("cl.gemm_4x_transa_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM)

#define HASH_MATRIXMUL_4X_TRANSA_LOCAL_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \
    CVIVANTE_NAMESPACE("cl.gemm_4x_transa_local_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM)

#define TENSOR_MATRIXMUL_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \
    { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 0, 0), \
    HASH_MATRIXMUL_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \

@ -90,6 +93,11 @@ __BEGIN_DECLS
    HASH_MATRIXMUL_4X_TRANSA_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \
    SOURCE },

#define TENSOR_MATRIXMUL_4X_TRANSA_LOCAL_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \
    {HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 2, 1, 0), \
    HASH_MATRIXMUL_4X_TRANSA_LOCAL_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \
    SOURCE },

#define TENSOR_MATRIXMUL_TRANSA_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \
    {HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 1, 0), \
    HASH_MATRIXMUL_TRANSA_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \

@ -142,6 +150,7 @@ static const struct {
    TENSOR_MATRIXMUL_MERGE_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_3)
    TENSOR_MATRIXMUL_4X_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_4)
    TENSOR_MATRIXMUL_4X_TRANSA_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_4)
    TENSOR_MATRIXMUL_4X_TRANSA_LOCAL_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_4)
};

/*
@ -313,6 +322,49 @@ final:
    return status;
} /* _matrixmul_4x_initializer() */

DEF_KERNEL_INITIALIZER(_matrixmul_4x_local_initializer)
    (vsi_nn_kernel_node_t node,
     const vsi_nn_kernel_node_param_t* param,
     size_t param_size) {
    vsi_status status = VSI_FAILURE;
    gpu_param_t gpu_param = {3, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}};

    vsi_nn_kernel_tensor_attr_t* attr = NULL;
    vsi_size_t width = 0;


    VSI_UNREFERENCED(param_size);

    attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[2]);
    CHECK_PTR_FAIL_GOTO(attr, "Create tensor attr buffer fail.", final);

    width = attr->shape->data[0];

    gpu_param.dim = 2;
    gpu_param.local_size[0] = 1;
    gpu_param.local_size[1] = 64;
    gpu_param.local_size[2] = 1;

    gpu_param.global_scale[0] = 16;
    gpu_param.global_scale[1] = 1;
    gpu_param.global_scale[2] = 1;

    gpu_param.global_size[0] =
        (width + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0];
    gpu_param.global_size[1] = 64;
    gpu_param.global_size[2] = 1;

    status = vsi_nn_kernel_gpu_config(node, &gpu_param);
    CHECK_STATUS_FAIL_GOTO(status, final);

final:
    if (attr) {
        vsi_nn_kernel_tensor_attr_release(&attr);
        attr = NULL;
    }
    return status;
} /* _matrixmul_4x_local_initializer() */

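Editor's note on the launch geometry above: `global_scale[0] = 16` means one thread produces 16 output columns, so the X grid is a ceiling division over the output width, while Y is pinned to 64 to match `local_size[1]`, i.e. exactly one 1x64 work-group per column strip. A standalone illustration of the only arithmetic involved (not ovxlib API):

```c
#include <stddef.h>

/* Ceiling division: how many 16-column strips cover `n` columns. */
static size_t ceil_div(size_t n, size_t d)
{
    return (n + d - 1) / d;
}
/* e.g. width = 100 -> ceil_div(100, 16) = 7 strips of 16 columns each */
```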
static vsi_status _query_kernel
    (
    vsi_nn_kernel_t * kernel,

@ -403,7 +455,10 @@ static vsi_status _query_kernel
        kernel->info.numParams = _cnt_of_array( _matrixmul_merge_kernel_param_def );
    }

    if (flag_4x) {
    if ((flag_4x == 2) && (transa == 1)) {
        kernel->info.initialize = _matrixmul_4x_local_initializer;
    }
    else if (flag_4x == 1) {
        kernel->info.initialize = _matrixmul_4x_initializer;
    } else {
        kernel->info.initialize = _matrixmul_initializer;

@ -471,6 +526,7 @@ static vsi_nn_kernel_node_t _setup
    uint32_t stride_axis_in_out[9] = {0};
    vsi_nn_tensor_t* tmp_inputs[2] = {NULL};
    vsi_nn_tensor_t* tmp_outputs[1] = {NULL};
    vsi_bool shader_cnt_support = FALSE;

    VSI_UNREFERENCED(input_num);
    VSI_UNREFERENCED(output_num);

@ -585,7 +641,20 @@ static vsi_nn_kernel_node_t _setup
        rs_out_tensors = vsi_nn_reshape_tensor(graph, tmp_outputs[0], final_shape, final_rank);
        final_out_tensors[0] = rs_out_tensors;

        flag_4x = 1;

#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
        shader_cnt_support =
            (graph->ctx->config.subGroupSize >= 64 && graph->ctx->config.use_40bits_va) ? TRUE : FALSE;
#endif
        if ((in1_h % 64 == 0) && (transFlg == 1) && (out_h % 8 == 0) && shader_cnt_support)
        {
            flag_4x = 2;
        }
        else
        {
            flag_4x = 1;
        }

    }
}
@ -246,28 +246,49 @@ static vsi_nn_kernel_node_t _setup
    vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL};
    vsi_bool image_2d = FALSE;
    vsi_nn_kernel_node_t node = NULL;

    float input0Scale = vsi_nn_get_tensor_scale(inputs[0]);
    float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input0Scale;
    float input1Scale = vsi_nn_get_tensor_scale(inputs[1]);
    float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1Scale;
    float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
    float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
    float input0_scale = vsi_nn_get_tensor_scale(inputs[0]);
    float input0_tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input0_scale;
    float input1_scale = vsi_nn_get_tensor_scale(inputs[1]);
    float input1_tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1_scale;
    float output_scale = vsi_nn_get_tensor_scale(outputs[0]);
    float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
    vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
    vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } };
    vsi_size_t new_rank = 0;
    vsi_bool ret = TRUE;

    VSI_UNREFERENCED(input_num);
    VSI_UNREFERENCED(output_num);
    VSI_UNREFERENCED(params);

    outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale;
    output_scale = vsi_abs(output_scale) < 1e-5 ? 0.0f : 1.0f / output_scale;

    if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
        outputs[0]->attr.dim_num ) )
    ret = vsi_nn_kernel_optimize_eltwise_shape(
        inputs[0]->attr.size, inputs[0]->attr.dim_num,
        inputs[1]->attr.size, inputs[1]->attr.dim_num,
        outputs[0]->attr.size, outputs[0]->attr.dim_num,
        shapes[0], shapes[1], shapes[2], &new_rank );

    if (ret == FALSE)
    {
        return NULL;
        goto final;
    }

    image_2d = (outputs[0]->attr.dim_num == 2);
    status = _query_kernel( inputs, outputs, image_2d, kernel );
    reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
        inputs[0], shapes[0], new_rank );
    reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
        inputs[1], shapes[1], new_rank );
    reshape_tensors[2] = vsi_nn_reshape_tensor( graph,
        outputs[0], shapes[2], new_rank );

    if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[2]->attr.size,
        reshape_tensors[2]->attr.dim_num ) )
    {
        goto final;
    }

    image_2d = (reshape_tensors[2]->attr.dim_num == 2);
    status = _query_kernel( reshape_tensors, &reshape_tensors[2], image_2d, kernel );
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );

@ -275,19 +296,19 @@ static vsi_nn_kernel_node_t _setup
        if ( node )
        {
            vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM,
                inputs, 2, outputs, 1 );
                reshape_tensors, 2, &reshape_tensors[2], 1 );
            node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create(
                graph, F32, &input0Scale );
                graph, F32, &input0_scale );
            node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create(
                graph, F32, &input0Tail );
                graph, F32, &input0_tail );
            node_params[SCALAR_INPUT1_SCALE] = vsi_nn_kernel_scalar_create(
                graph, F32, &input1Scale );
                graph, F32, &input1_scale );
            node_params[SCALAR_INPUT1_TAIL] = vsi_nn_kernel_scalar_create(
                graph, F32, &input1Tail );
                graph, F32, &input1_tail );
            node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create(
                graph, F32, &outputScale );
                graph, F32, &output_scale );
            node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create(
                graph, F32, &outputZP );
                graph, F32, &output_zp );

            /* Pass parameters to node. */
            status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM );

@ -300,6 +321,12 @@ static vsi_nn_kernel_node_t _setup
            vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] );
        }
    }

final:
    vsi_safe_release_tensor(reshape_tensors[0]);
    vsi_safe_release_tensor(reshape_tensors[1]);
    vsi_safe_release_tensor(reshape_tensors[2]);

    return node;
} /* _setup() */

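Editor's note: this rewrite (and the nearly identical one in the next file) replaces "reject unsupported shapes" with "optimize the shapes first": collapse broadcast-compatible dimensions, create reshaped views of all three tensors, run the kernel on the views, and release them at `final:`. A hypothetical sketch of the dimension-collapsing idea behind `vsi_nn_kernel_optimize_eltwise_shape` (simplified to one operand pair; the real helper also handles the output shape):

```c
#include <stdint.h>

/* Runs of axes where both operands have identical extents (no broadcast)
 * can be folded into a single longer axis before reshaping. */
static uint32_t collapse_equal_axes(const uint32_t* a, const uint32_t* b,
                                    uint32_t rank, uint32_t* out)
{
    uint32_t n = 0;
    uint32_t i;
    for (i = 0; i < rank; i++)
    {
        if (n > 0 && a[i] == b[i] && a[i - 1] == b[i - 1])
        {
            out[n - 1] *= a[i]; /* extend the previous folded axis */
        }
        else
        {
            out[n++] = a[i];    /* start a new axis (possible broadcast) */
        }
    }
    return n; /* the optimized rank */
}
```

For two tensors of shape {4, 5, 6} this yields the single axis {120}, which is why the reshaped views often pass the GPU shape check that the raw tensors would fail.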
@ -246,29 +246,49 @@ static vsi_nn_kernel_node_t _setup
    vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL};
    vsi_bool image_2d = FALSE;
    vsi_nn_kernel_node_t node = NULL;

    float input0Scale = vsi_nn_get_tensor_scale(inputs[0]);
    float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input0Scale;
    float input1Scale = vsi_nn_get_tensor_scale(inputs[1]);
    float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1Scale;
    float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
    float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
    float input0_scale = vsi_nn_get_tensor_scale(inputs[0]);
    float input0_tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input0_scale;
    float input1_scale = vsi_nn_get_tensor_scale(inputs[1]);
    float input1_tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1_scale;
    float output_scale = vsi_nn_get_tensor_scale(outputs[0]);
    float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
    vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
    vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } };
    vsi_size_t new_rank = 0;
    vsi_bool ret = TRUE;

    VSI_UNREFERENCED(input_num);
    VSI_UNREFERENCED(output_num);
    VSI_UNREFERENCED(params);

    output_scale = vsi_abs(output_scale) < 1e-5 ? 0.0f : 1.0f / output_scale;

    outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale;
    ret = vsi_nn_kernel_optimize_eltwise_shape(
        inputs[0]->attr.size, inputs[0]->attr.dim_num,
        inputs[1]->attr.size, inputs[1]->attr.dim_num,
        outputs[0]->attr.size, outputs[0]->attr.dim_num,
        shapes[0], shapes[1], shapes[2], &new_rank );

    if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
        outputs[0]->attr.dim_num ) )
    if (ret == FALSE)
    {
        return NULL;
        goto final;
    }

    image_2d = (outputs[0]->attr.dim_num == 2);
    status = _query_kernel( inputs, outputs, image_2d, kernel );
    reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
        inputs[0], shapes[0], new_rank );
    reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
        inputs[1], shapes[1], new_rank );
    reshape_tensors[2] = vsi_nn_reshape_tensor( graph,
        outputs[0], shapes[2], new_rank );

    if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[2]->attr.size,
        reshape_tensors[2]->attr.dim_num ) )
    {
        goto final;
    }

    image_2d = (reshape_tensors[2]->attr.dim_num == 2);
    status = _query_kernel( reshape_tensors, &reshape_tensors[2], image_2d, kernel );
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );

@ -276,19 +296,19 @@ static vsi_nn_kernel_node_t _setup
        if ( node )
        {
            vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM,
                inputs, 2, outputs, 1 );
                reshape_tensors, 2, &reshape_tensors[2], 1 );
            node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create(
                graph, F32, &input0Scale );
                graph, F32, &input0_scale );
            node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create(
                graph, F32, &input0Tail );
                graph, F32, &input0_tail );
            node_params[SCALAR_INPUT1_SCALE] = vsi_nn_kernel_scalar_create(
                graph, F32, &input1Scale );
                graph, F32, &input1_scale );
            node_params[SCALAR_INPUT1_TAIL] = vsi_nn_kernel_scalar_create(
                graph, F32, &input1Tail );
                graph, F32, &input1_tail );
            node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create(
                graph, F32, &outputScale );
                graph, F32, &output_scale );
            node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create(
                graph, F32, &outputZP );
                graph, F32, &output_zp );

            /* Pass parameters to node. */
            status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM );

@ -301,6 +321,12 @@ static vsi_nn_kernel_node_t _setup
            vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] );
        }
    }

final:
    vsi_safe_release_tensor(reshape_tensors[0]);
    vsi_safe_release_tensor(reshape_tensors[1]);
    vsi_safe_release_tensor(reshape_tensors[2]);

    return node;
} /* _setup() */

@ -22,6 +22,7 @@
 *
 *****************************************************************************/

#if !(VX_TENSOR_POW_API_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

@ -294,4 +295,4 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_CL( pow, _setup )

#endif
@ -0,0 +1,320 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/


#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"

__BEGIN_DECLS

#define _RESIZE_CUBIC_KERNEL_SOURCE() "resize_cubic"

#define STR(a) #a
// Add kernel hashtable here
#define RESIZE_CUBIC_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
    (( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) )

#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
    { RESIZE_CUBIC_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \
    CVIVANTE_NAMESPACE("cl.resize_cubic_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
    _RESIZE_CUBIC_KERNEL_SOURCE() }

typedef struct
{
    uint32_t key;
    char * function_name;
    const char * source_name;
} _kernel_map_type;

static const _kernel_map_type _resize_cubic_kernel_map[] =
{
    PACK_KERNEL_MAP( F32, F32),
    PACK_KERNEL_MAP( U8, U8),
};


/*
 * Kernel params
 */
static vx_param_description_t _resize_cubic_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};

#define SCALAR_SCALE_X (2)
#define SCALAR_SCALE_Y (3)
#define SCALAR_HALF_PIXEL (4)
#define SCALAR_INPUT_SCALE (5)
#define SCALAR_INPUT_TAIL (6)
#define SCALAR_OUTPUT_SCALE (7)
#define SCALAR_OUTPUT_TAIL (8)


#define RESIZE_CUBIC_NUM 5
#define RESIZE_CUBIC_QUANT_NUM _cnt_of_array( _resize_cubic_kernel_param_def )


/*
 * Kernel initializer
 */
DEF_KERNEL_INITIALIZER(_resize_cubic_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VSI_FAILURE;
    gpu_param_t gpu_param = {
        3,
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0}
        };
    vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
    vsi_size_array_t * out_shape = NULL;

    VSI_UNREFERENCED(param_size);

    output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
    CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );

    out_shape = output_attr->shape;

    gpu_param.global_scale[0] = 1;
    gpu_param.global_scale[1] = 1;
    gpu_param.global_scale[2] = 1;

    gpu_param.dim = (out_shape->size < 3 || 1 == out_shape->data[2]) ? 2 : 3;
    gpu_param.global_size[0] = gpu_align_p2(
        (out_shape->data[0] + gpu_param.global_scale[0] - 1)
        / gpu_param.global_scale[0], 4);
    gpu_param.global_size[1] = (
        (out_shape->data[1] + gpu_param.global_scale[1] - 1)
        / gpu_param.global_scale[1]);
    gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
    status = vsi_nn_kernel_gpu_config( node, &gpu_param );

final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if ( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
    SAFE_FREE_TENSOR_ATTR(output_attr);
    return status;
} /* _resize_cubic_initializer() */


/*
 * Query kernel
 */
static vsi_status _query_kernel
    (
    vsi_nn_kernel_t * kernel,
    vsi_nn_tensor_t * const * const inputs,
    vsi_nn_tensor_t * const * const outputs,
    vsi_bool *is_use_u8_kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_dtype_e in_dtype;
    vsi_nn_kernel_dtype_e out_dtype;
    const _kernel_map_type * kernel_map = _resize_cubic_kernel_map;
    size_t kernel_map_size = _cnt_of_array( _resize_cubic_kernel_map );
    vx_param_description_t * param_def = _resize_cubic_kernel_param_def;
    size_t param_def_size = _cnt_of_array( _resize_cubic_kernel_param_def );
    vx_kernel_initialize_f initializer = _resize_cubic_initializer;

    uint32_t key;
    uint32_t i;

    in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    if (F16 == in_dtype)
    {
        in_dtype = F32;
    }
    if (F16 == out_dtype)
    {
        out_dtype = F32;
    }

    if ((U8 == in_dtype) || (U8 == out_dtype))
    {
        param_def_size = RESIZE_CUBIC_QUANT_NUM;
        *is_use_u8_kernel = TRUE;
    }
    else
    {
        param_def_size = RESIZE_CUBIC_NUM;
        *is_use_u8_kernel = FALSE;
    }

    key = RESIZE_CUBIC_HASH_KEY( in_dtype, out_dtype );

    for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
    {
        if ( kernel_map[i].key == key )
        {
            break;
        }
    }
    if ( i < kernel_map_size )
    {
        snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
        kernel->info.parameters = param_def;
        kernel->info.numParams = (uint32_t)param_def_size;
        kernel->info.initialize = initializer;
        // Register code source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
            kernel_map[i].source_name );
        // Register binary source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
            kernel_map[i].source_name );
        status = VSI_SUCCESS;
    }

    return status;
} /* _query_kernel() */

static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t ** outputs,
    size_t output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t * kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[RESIZE_CUBIC_QUANT_NUM] = {NULL};
    vsi_nn_kernel_node_t node = NULL;
    int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" );
    int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" );
    vsi_size_t in_width = inputs[0]->attr.size[0];
    vsi_size_t in_height = inputs[0]->attr.size[1];
    vsi_size_t out_width = outputs[0]->attr.size[0];
    vsi_size_t out_height = outputs[0]->attr.size[1];
    float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
    float input_scale = vsi_nn_get_tensor_scale(inputs[0]);
    float input_tail = -(input_zp * input_scale);
    float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
    float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]);
    float half_pixel_value = 0.0f;
    float scale_factor_x = 0.0f;
    float scale_factor_y = 0.0f;
    vsi_bool is_use_u8_kernel = FALSE;

    if (align_corners && out_width > 1)
    {
        scale_factor_x = ((vx_float32)(in_width - 1) * 1.0f) / (vx_float32)(out_width - 1);
    }
    else
    {
        scale_factor_x = ((vx_float32)in_width * 1.0f) / (vx_float32)out_width;
    }

    if (align_corners && out_height > 1)
    {
        scale_factor_y = ((vx_float32)(in_height - 1) * 1.0f) / (vx_float32)(out_height - 1);
    }
    else
    {
        scale_factor_y = ((vx_float32)in_height * 1.0f) / (vx_float32)out_height;
    }

    if (half_pixel_centers)
    {
        half_pixel_value = 0.5f;
    }
    else
    {
        half_pixel_value = 0.0f;
    }


    status = _query_kernel( kernel, inputs, outputs, &is_use_u8_kernel );
    if (VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if ( node )
        {
            size_t node_params_num = RESIZE_CUBIC_NUM;
            /* Set inputs and outputs */
            vsi_nn_kernel_node_pack_io( node_params, RESIZE_CUBIC_QUANT_NUM,
                inputs, input_num, outputs, output_num );
            node_params[SCALAR_SCALE_X] = vsi_nn_kernel_scalar_create( graph, F32, &scale_factor_x );
            node_params[SCALAR_SCALE_Y] = vsi_nn_kernel_scalar_create( graph, F32, &scale_factor_y );
            node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, F32, &half_pixel_value );
            if (is_use_u8_kernel)
            {
                node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale );
                node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create( graph, F32, &input_tail );
                node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale );
                node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp );
                node_params_num = RESIZE_CUBIC_QUANT_NUM;
            }
            /* Pass parameters to node. */
            status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
            VSI_ASSERT( status == VSI_SUCCESS );
            vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_X] );
            vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_Y] );
            vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] );
            if (is_use_u8_kernel)
            {
                vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] );
                vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] );
                vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] );
                vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_TAIL] );
            }
        }
    }

    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_CL( resize_cubic, _setup )
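Editor's note on the scale-factor selection in `_setup` above: with `align_corners` the first and last output samples must land exactly on the input corners, so the sampling step is (in - 1)/(out - 1); otherwise it is the plain ratio in/out. The difference in numbers (standalone illustration):

```c
#include <stdio.h>

int main(void)
{
    float in = 4.0f, out = 8.0f;
    /* align_corners: 3/7 ~ 0.4286, so sample 7 lands on input index 3 */
    printf("align_corners: %f\n", (in - 1.0f) / (out - 1.0f));
    /* default: 0.5, with half_pixel_centers adding the 0.5 offset */
    printf("default:       %f\n", in / out);
    return 0;
}
```

The `out > 1` guard matters: with a single output sample the corner formula would divide by zero, so the code falls back to the plain ratio.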
@ -0,0 +1,727 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"

__BEGIN_DECLS

typedef enum
{
    NONE = 0,
    Add,
    Mul,
    Max,
    Min
} vsi_scatter_nd_update_type_e;

/*
 * Define kernel meta.
 */
#define KERNEL_SOURCE_1 "scatter_nd_update_reduction"
#define KERNEL_SOURCE_2 "scatter_nd_update_reduction_conv"

#define HASH_SCATTER_ND_UPDATE_KEY(_input0_type, _input2_type, _output_type, _stage, _op) \
    ((_input0_type << 24) | (_input2_type << 16) | (_output_type << 8) | (_stage << 4) | (_op))

#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_PREPROCESS_NAME(SRC0_TYPE) \
    CVIVANTE_NAMESPACE("cl.scatter_nd_update_reduction_preprocess_"#SRC0_TYPE)

#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_PROCESS_NAME(REDUCTION_TYPE, SRC2_TYPE) \
    CVIVANTE_NAMESPACE("cl.scatter_nd_update_reduction_"#REDUCTION_TYPE"_"#SRC2_TYPE)

#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_CONV_NAME(DST_TYPE) \
    CVIVANTE_NAMESPACE("cl.scatter_nd_update_reduction_conv_"#DST_TYPE)

#define TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(IN0_TYPE, SOURCE) \
    { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, 0, 0, 0, 0), \
    HASH_SCATTER_ND_UPDATE_SH_KERNEL_PREPROCESS_NAME(IN0_TYPE), \
    SOURCE },

#define TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(REDUCTION_TYPE, IN2_TYPE, SOURCE) \
    { HASH_SCATTER_ND_UPDATE_KEY(0, IN2_TYPE, 0, 1, REDUCTION_TYPE), \
    HASH_SCATTER_ND_UPDATE_SH_KERNEL_PROCESS_NAME(REDUCTION_TYPE, IN2_TYPE), \
    SOURCE },

#define TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(OUT_TYPE, SOURCE) \
    { HASH_SCATTER_ND_UPDATE_KEY(0, 0, OUT_TYPE, 2, 0), \
    HASH_SCATTER_ND_UPDATE_SH_KERNEL_CONV_NAME(OUT_TYPE), \
    SOURCE },

typedef struct
{
    uint32_t key;
    char * function_name;
    const char * source_name;
} _kernel_map_type;

static const _kernel_map_type scatter_nd_update_reduction_preprocess_map[] =
{
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(U8, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(I8, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(I16, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(F16, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(F32, KERNEL_SOURCE_1)
};

static const _kernel_map_type scatter_nd_update_reduction_process_map[] =
{
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, U8, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, U8, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, U8, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, U8, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, I8, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, I8, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, I8, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, I8, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, I16, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, I16, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, I16, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, I16, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, F16, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, F16, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, F16, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, F16, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, F32, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, F32, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, F32, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, F32, KERNEL_SOURCE_1)
};

static const _kernel_map_type scatter_nd_update_reduction_conv_map[] =
{
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(U8, KERNEL_SOURCE_2)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(I8, KERNEL_SOURCE_2)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(I16, KERNEL_SOURCE_2)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(F16, KERNEL_SOURCE_2)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(F32, KERNEL_SOURCE_2)
};

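Editor's note: the three kernel maps above suggest the reduction path runs as a pipeline of three chained kernels, each looked up in its own map through the `_stage` nibble of HASH_SCATTER_ND_UPDATE_KEY. The stage meanings below are inferred from the kernel names, so treat this labeling as hypothetical:

```c
/* Hypothetical reading of the 'stage' field (0/1/2 in the key macro). */
enum
{
    SCATTER_STAGE_PREPROCESS = 0, /* stage keyed only by the input dtype  */
    SCATTER_STAGE_PROCESS    = 1, /* applies the Add/Mul/Max/Min updates  */
    SCATTER_STAGE_CONV       = 2  /* converts back to the output dtype    */
};
```

Keying each stage on only the dtype it actually touches (input0, input2, or output) keeps the three maps small instead of enumerating the full dtype cross-product.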
/*
 * Kernel params
 */
static vx_param_description_t _scatter_nd_update_preprocess_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};

static vx_param_description_t _scatter_nd_update_process_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};

static vx_param_description_t _scatter_nd_update_conv_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};

#define _SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM _cnt_of_array(_scatter_nd_update_preprocess_kernel_param_def)
#define _SCATTER_ND_UPDATE_PROCESS_PARAM_NUM _cnt_of_array(_scatter_nd_update_process_kernel_param_def)
#define _SCATTER_ND_UPDATE_CONV_PARAM_NUM _cnt_of_array(_scatter_nd_update_conv_kernel_param_def)

static vsi_status cal_scatter_nd_update_tensor_reshape_size
    (
    vsi_nn_tensor_t ** inputs,
    vsi_size_t sizes[VSI_NN_MAX_DIM_NUM],
    uint32_t block_size,
    uint32_t coordDim,
    vsi_size_t strides[VSI_NN_MAX_DIM_NUM],
    int32_t* newDim
    )
{
    vsi_status status = VSI_SUCCESS;
    uint32_t dims_num = inputs[0]->attr.dim_num;
    vsi_size_t *input_size = inputs[0]->attr.size;
    uint32_t i = 0;
    vsi_size_t elementCnt = 1;

#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH

    newDim[0] = 0;
    for (i = 0; i < dims_num; ++i)
    {
        elementCnt *= input_size[i];
    }

    for (i = 0; i < VSI_NN_MAX_DIM_NUM; ++i)
    {
        sizes[i] = 1;
    }

    sizes[0] = block_size;
    sizes[1] = elementCnt / block_size;
    newDim[0] = 2;

    if (coordDim == 1 && strides) // index shape
    {
        for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++)
        {
            strides[i] = 0;
        }
    }
    else if (coordDim >= 2 && coordDim <= VSI_NN_MAX_DIM_NUM && strides)
    {
        for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++)
        {
            strides[i] = 0;
        }

        strides[0] = input_size[dims_num - coordDim];
        for (i = 1; i < coordDim - 1; i++)
        {
            strides[i] = strides[i - 1] * input_size[dims_num - coordDim + i];
        }
    }

#undef VSI_NN_MAX_IMAGE_WIDTH

    return status;
} /* cal_scatter_nd_update_tensor_reshape_size */
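Editor's note: for `coordDim >= 2` the helper above precomputes cumulative strides over the innermost `coordDim` axes so the shader can flatten a multi-dimensional index with multiplies and adds. A worked example (illustration only; how the shader consumes the strides is an assumption):

```c
#include <stdio.h>
#include <stddef.h>

/* Target shape {4, 5, 6} with size[0] innermost, coordDim = 3. */
int main(void)
{
    size_t size[3] = {4, 5, 6};
    size_t strides[2];
    unsigned dims_num = 3, coordDim = 3, i;

    strides[0] = size[dims_num - coordDim];                          /* 4  */
    for (i = 1; i < coordDim - 1; i++)
        strides[i] = strides[i - 1] * size[dims_num - coordDim + i]; /* 20 */

    printf("%zu %zu\n", strides[0], strides[1]); /* prints: 4 20 */
    return 0;
}
```

With `coordDim == 1` all strides stay zero and the index can be used directly, which is why that branch only clears the array.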
/*
|
||||
* Kernel initializer
|
||||
*/
|
||||
DEF_KERNEL_INITIALIZER(_scatter_nd_update_reduction_preprocess_initializer)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
gpu_param_t gpu_param = {
|
||||
1,
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0}
|
||||
};
|
||||
|
||||
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
|
||||
int32_t width = 0;
|
||||
int32_t element_size = 1;
|
||||
int32_t i = 0;
|
||||
|
||||
VSI_UNREFERENCED(param_size);
|
||||
|
||||
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
|
||||
|
||||
for (i = 0; i < (int32_t)attr[0]->shape->size; i++)
|
||||
{
|
||||
element_size *= (int32_t)attr[0]->shape->data[i];
|
||||
}
|
||||
width = element_size / 8;
|
||||
|
||||
gpu_param.global_scale[0] = 1;
|
||||
gpu_param.global_scale[1] = 1;
|
||||
gpu_param.global_scale[2] = 1;
|
||||
|
||||
if (element_size < 8)
|
||||
{
|
||||
gpu_param.global_size[0] = element_size;
|
||||
}
|
||||
else
|
||||
{
|
||||
gpu_param.global_size[0] = width;
|
||||
}
|
||||
gpu_param.global_size[1] = 1;
|
||||
gpu_param.global_size[2] = 1;
|
||||
|
||||
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
|
||||
CHECK_STATUS_FAIL_GOTO(status, final);
|
||||
|
||||
final:
|
||||
if (attr[0])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &attr[0] );
|
||||
attr[0] = NULL;
|
||||
}
|
||||
return status;
|
||||
} /* _scatter_nd_update_reduction_preprocess_initializer() */

DEF_KERNEL_INITIALIZER(_scatter_nd_update_process_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VSI_FAILURE;
    gpu_param_t gpu_param = {
        2,
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0}
        };

    vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
    int32_t block_size = 1;
    int32_t index_num = 1;

    VSI_UNREFERENCED(param_size);

    attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
    attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );

    block_size = (int32_t)(attr[1]->shape->data[0]);
    index_num = (int32_t)(attr[0]->shape->data[1]);

    gpu_param.global_scale[0] = 1;
    gpu_param.global_scale[1] = 1;
    gpu_param.global_scale[2] = 1;

    gpu_param.global_size[0] = block_size;
    gpu_param.global_size[1] = index_num;
    gpu_param.global_size[2] = 1;

    status = vsi_nn_kernel_gpu_config( node, &gpu_param );
    CHECK_STATUS_FAIL_GOTO(status, OnError);

OnError:
    if (attr[0])
    {
        vsi_nn_kernel_tensor_attr_release( &attr[0] );
        attr[0] = NULL;
    }
    if (attr[1])
    {
        vsi_nn_kernel_tensor_attr_release( &attr[1] );
        attr[1] = NULL;
    }
    return status;
} /* _scatter_nd_update_process_initializer() */
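/* Reference semantics (a sketch for readers, not shipped code): each GPU
 * work-item (x, y) above handles element x of update row y. A plain scalar
 * equivalent of scatter-nd-update would look roughly like:
 *
 *     memcpy( output, input, element_cnt * sizeof(float) );
 *     for (n = 0; n < index_num; n++)
 *     {
 *         offset = flatten( indices + n * coord_dim, coord_strides );
 *         for (x = 0; x < block_size; x++)
 *         {
 *             output[offset * block_size + x] = updates[n * block_size + x];
 *         }
 *     }
 *
 * with the assignment replaced by +=, min(), etc. for the reduction modes;
 * flatten() is hypothetical shorthand for the stride dot product shown
 * earlier. */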

DEF_KERNEL_INITIALIZER(_scatter_nd_update_conv_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VSI_FAILURE;
    gpu_param_t gpu_param = {
        1,
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0}
        };

    vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
    int32_t width = 0;
    int32_t element_size = 1;
    int32_t i = 0;

    VSI_UNREFERENCED(param_size);

    attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );

    for (i = 0; i < (int32_t)attr[0]->shape->size; i++)
    {
        element_size *= (int32_t)attr[0]->shape->data[i];
    }
    width = element_size / 8;

    gpu_param.global_scale[0] = 1;
    gpu_param.global_scale[1] = 1;
    gpu_param.global_scale[2] = 1;
    if (element_size < 8)
    {
        gpu_param.global_size[0] = element_size;
    }
    else
    {
        gpu_param.global_size[0] = width;
    }
    gpu_param.global_size[1] = 1;
    gpu_param.global_size[2] = 1;

    status = vsi_nn_kernel_gpu_config( node, &gpu_param );
    CHECK_STATUS_FAIL_GOTO(status, OnError);

OnError:
    if (attr[0])
    {
        vsi_nn_kernel_tensor_attr_release( &attr[0] );
        attr[0] = NULL;
    }
    return status;
} /* _scatter_nd_update_conv_initializer() */

static vsi_status _query_kernel
    (
    vsi_nn_tensor_t* const* const inputs,
    vsi_nn_tensor_t* const* const outputs,
    vsi_nn_kernel_t* kernel_preprocess,
    vsi_nn_kernel_t* kernel_process,
    vsi_nn_kernel_t* kernel_conv,
    int32_t reduction_flg
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_dtype_e input0_dtype = U8;
    vsi_nn_kernel_dtype_e input2_dtype = U8;
    vsi_nn_kernel_dtype_e output_dtype = U8;
    uint32_t key = 0;
    size_t i = 0;

    input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
    output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, 0, 0, 0, 0 );

    for ( i = 0; i < _cnt_of_array(scatter_nd_update_reduction_preprocess_map); i ++ )
    {
        if ( scatter_nd_update_reduction_preprocess_map[i].key == key )
        {
            break;
        }
    }

    if ( i < _cnt_of_array(scatter_nd_update_reduction_preprocess_map) )
    {
        snprintf( kernel_preprocess->info.name, VX_MAX_KERNEL_NAME, "%s",
            scatter_nd_update_reduction_preprocess_map[i].function_name );
        kernel_preprocess->info.parameters = _scatter_nd_update_preprocess_kernel_param_def;
        kernel_preprocess->info.numParams = _cnt_of_array( _scatter_nd_update_preprocess_kernel_param_def );
        kernel_preprocess->info.initialize = _scatter_nd_update_reduction_preprocess_initializer;
        // Register code source
        vsi_nn_kernel_add_source( kernel_preprocess, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
            "eltwise_ops_helper",
            scatter_nd_update_reduction_preprocess_map[i].source_name );
        // Register binary source
        vsi_nn_kernel_add_source( kernel_preprocess, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
            scatter_nd_update_reduction_preprocess_map[i].source_name );
        status = VSI_SUCCESS;
    }

    key = HASH_SCATTER_ND_UPDATE_KEY( 0, input2_dtype, 0, 1, reduction_flg );

    for ( i = 0; i < _cnt_of_array(scatter_nd_update_reduction_process_map); i ++ )
    {
        if ( scatter_nd_update_reduction_process_map[i].key == key )
        {
            break;
        }
    }

    if ( i < _cnt_of_array(scatter_nd_update_reduction_process_map) )
    {
        snprintf( kernel_process->info.name, VX_MAX_KERNEL_NAME, "%s",
            scatter_nd_update_reduction_process_map[i].function_name );
        kernel_process->info.parameters = _scatter_nd_update_process_kernel_param_def;
        kernel_process->info.numParams = _cnt_of_array( _scatter_nd_update_process_kernel_param_def );
        kernel_process->info.initialize = _scatter_nd_update_process_initializer;
        // Register code source
        vsi_nn_kernel_add_source( kernel_process, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
            "eltwise_ops_helper",
            scatter_nd_update_reduction_process_map[i].source_name );
        // Register binary source
        vsi_nn_kernel_add_source( kernel_process, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
            scatter_nd_update_reduction_process_map[i].source_name );
        status = VSI_SUCCESS;
    }

    key = HASH_SCATTER_ND_UPDATE_KEY( 0, 0, output_dtype, 2, 0 );

    for ( i = 0; i < _cnt_of_array(scatter_nd_update_reduction_conv_map); i ++ )
    {
        if ( scatter_nd_update_reduction_conv_map[i].key == key )
        {
            break;
        }
    }

    if ( i < _cnt_of_array(scatter_nd_update_reduction_conv_map) )
    {
        snprintf( kernel_conv->info.name, VX_MAX_KERNEL_NAME, "%s",
            scatter_nd_update_reduction_conv_map[i].function_name );
        kernel_conv->info.parameters = _scatter_nd_update_conv_kernel_param_def;
        kernel_conv->info.numParams = _cnt_of_array( _scatter_nd_update_conv_kernel_param_def );
        kernel_conv->info.initialize = _scatter_nd_update_conv_initializer;
        // Register code source
        vsi_nn_kernel_add_source( kernel_conv, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
            "eltwise_ops_helper",
            scatter_nd_update_reduction_conv_map[i].source_name );
        // Register binary source
        vsi_nn_kernel_add_source( kernel_conv, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
            scatter_nd_update_reduction_conv_map[i].source_name );
        status = VSI_SUCCESS;
    }

    return status;
} /* _query_kernel() */
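/* Note: the three lookups above assemble a three-stage pipeline, one kernel
 * per stage: a preprocess kernel that dequantizes the reference input to
 * float, a process kernel that applies the reduction update, and a "conv"
 * kernel that requantizes the float result to the output type. Each stage
 * is keyed only on the dtype it actually touches. */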

static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t ** outputs,
    size_t output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t * kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_t node = NULL;
    vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}};
    vsi_size_t strides[VSI_NN_MAX_DIM_NUM] = {0};
    int32_t coord_strides[VSI_NN_MAX_DIM_NUM] = {0};
    int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
    int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" );
    int32_t reduction = vsi_nn_kernel_param_get_int32( params, "reduction" );
    int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0;
    float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
    float input_scale = vsi_nn_get_tensor_scale(inputs[0]);
    float input_zp_scale = 0 - input_zp * input_scale;
    float update_zp = (float)vsi_nn_get_tensor_zero_point(inputs[2]);
    float update_scale = vsi_nn_get_tensor_scale(inputs[2]);
    float update_zp_scale = 0 - update_zp * update_scale;
    float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
    float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]);
    vsi_nn_tensor_t * tensors[2] = { NULL };
    vsi_nn_kernel_t * ikernels[2] = { NULL };
    int32_t i = 0;

    VSI_UNREFERENCED(input_num);
    VSI_UNREFERENCED(output_num);

    if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
        outputs[0]->attr.dim_num ) )
    {
        return NULL;
    }

    status = cal_scatter_nd_update_tensor_reshape_size(&inputs[1], shapes[0], coord_dim, 0,
        NULL, &rs_idx_dim);
    status |= cal_scatter_nd_update_tensor_reshape_size(&inputs[2], shapes[1], block_size, 0,
        NULL, &rs_in_dim);
    status |= cal_scatter_nd_update_tensor_reshape_size(&outputs[0], shapes[2], block_size, coord_dim,
        strides, &rs_out_dim);
    CHECK_STATUS_FAIL_GOTO( status, final );

    coord_strides[coord_dim - 1] = 1;
    for (i = 0; i < coord_dim - 1; i++)
    {
        coord_strides[i] = (int32_t)strides[coord_dim - 2 - i];
    }
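    /* The reshape helper filled strides[] innermost-first; scatter
     * coordinates are walked outermost-first, so the loop above reverses
     * the order and pins the innermost coordinate stride to 1. */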

    {
        vsi_nn_tensor_attr_t attr;
        vsi_nn_kernel_node_t preprocess_node = NULL;
        vsi_nn_kernel_node_t process_node = NULL;
        vsi_nn_kernel_node_param_t preprocess_params[_SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM] = { NULL };
        vsi_nn_kernel_node_param_t process_params[_SCATTER_ND_UPDATE_PROCESS_PARAM_NUM] = { NULL };
        vsi_nn_kernel_node_param_t conv_params[_SCATTER_ND_UPDATE_CONV_PARAM_NUM] = { NULL };
        int32_t width = 1;
        int32_t res = 0;
        int32_t update_width = (int32_t)shapes[1][0];
        int32_t output_width = (int32_t)shapes[2][0];

        ikernels[0] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_CL );
        ikernels[0]->unique_id = kernel->unique_id;
        ikernels[1] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_CL );
        ikernels[1]->unique_id = kernel->unique_id;

        memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) );
        attr.dtype = outputs[0]->attr.dtype;
        attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
        attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
        attr.is_const = FALSE;
        attr.vtl = TRUE;

        for (i = 0; i < rs_out_dim; i++)
        {
            attr.size[i] = shapes[2][i];
            width *= (int32_t)shapes[2][i];
        }
        attr.dim_num = rs_out_dim;

        res = width % 8;
        width = (width >> 3) << 3;
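        /* width is rounded down to a multiple of 8 (the CL kernels
         * evidently process eight elements per work-item); res keeps the
         * 0..7 leftover elements for the scalar tail path. */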

        tensors[0] = vsi_nn_CreateTensor( graph, &attr ); // ref'
        attr.size[0] = 1;
        attr.size[1] = 1;
        attr.dim_num = rs_out_dim;
        tensors[1] = vsi_nn_CreateTensor( graph, &attr ); // link_buffer0

        status = _query_kernel( inputs, outputs, ikernels[0], ikernels[1], kernel, reduction);
        if ( VSI_SUCCESS == status)
        {
            // convert ref to float
            preprocess_node = vsi_nn_kernel_create_node( graph, ikernels[0] );
            if (preprocess_node)
            {
                uint32_t index = 0;
                /* Pass parameters to node. */
                preprocess_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[2], rs_out_dim );
                preprocess_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t;
                preprocess_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
                preprocess_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &res );
                preprocess_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale );
                preprocess_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_zp_scale );
                status = vsi_nn_kernel_node_pass_param( preprocess_node, preprocess_params,
                    _SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM );
                CHECK_STATUS(status);
                vsi_nn_kernel_tensor_release( &preprocess_params[0] );
                vsi_nn_kernel_scalar_release( &preprocess_params[2] );
                vsi_nn_kernel_scalar_release( &preprocess_params[3] );
                vsi_nn_kernel_scalar_release( &preprocess_params[4] );
                vsi_nn_kernel_scalar_release( &preprocess_params[5] );
            }

            // update
            process_node = vsi_nn_kernel_create_node( graph, ikernels[1] );
            if (process_node)
            {
                uint32_t index = 0;
                /* Pass parameters to node. */
                process_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[0], rs_idx_dim );
                process_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shapes[1], rs_in_dim );
                process_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t;
                process_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t;
                process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[0] );
                process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[1] );
                process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[2] );
                process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[3] );
                process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[4] );
                process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[5] );
                process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[6] );
                process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim );
                process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &update_width );
                process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &output_width );
                process_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &update_scale );
                process_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &update_zp_scale );
                status = vsi_nn_kernel_node_pass_param( process_node, process_params,
                    _SCATTER_ND_UPDATE_PROCESS_PARAM_NUM );
                CHECK_STATUS(status);
                vsi_nn_kernel_tensor_release( &process_params[0] );
                vsi_nn_kernel_tensor_release( &process_params[1] );
                vsi_nn_kernel_scalar_release( &process_params[4] );
                vsi_nn_kernel_scalar_release( &process_params[5] );
                vsi_nn_kernel_scalar_release( &process_params[6] );
                vsi_nn_kernel_scalar_release( &process_params[7] );
                vsi_nn_kernel_scalar_release( &process_params[8] );
                vsi_nn_kernel_scalar_release( &process_params[9] );
                vsi_nn_kernel_scalar_release( &process_params[10] );
                vsi_nn_kernel_scalar_release( &process_params[11] );
                vsi_nn_kernel_scalar_release( &process_params[12] );
                vsi_nn_kernel_scalar_release( &process_params[13] );
                vsi_nn_kernel_scalar_release( &process_params[14] );
                vsi_nn_kernel_scalar_release( &process_params[15] );
            }

            // convert float to output
            node = vsi_nn_kernel_create_node( graph, kernel );
            if ( node )
            {
                uint32_t index = 0;
                /* Pass parameters to node. */
                conv_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t;
                conv_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t;
                conv_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], rs_out_dim );
                conv_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
                conv_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &res );
                conv_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale );
                conv_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp );
                status = vsi_nn_kernel_node_pass_param( node, conv_params, _SCATTER_ND_UPDATE_CONV_PARAM_NUM );
                CHECK_STATUS(status);
                vsi_nn_kernel_tensor_release( &conv_params[2] );
                vsi_nn_kernel_scalar_release( &conv_params[3] );
                vsi_nn_kernel_scalar_release( &conv_params[4] );
                vsi_nn_kernel_scalar_release( &conv_params[5] );
                vsi_nn_kernel_scalar_release( &conv_params[6] );
            }
        }

        if (preprocess_node) {vsi_nn_kernel_node_release( &preprocess_node );}
        if (process_node) {vsi_nn_kernel_node_release( &process_node );}
    }

final:
    if (ikernels[0])
    {
        vsi_nn_kernel_release(&ikernels[0]);
    }
    if (ikernels[1])
    {
        vsi_nn_kernel_release(&ikernels[1]);
    }
    vsi_safe_release_tensor(tensors[0]);
    vsi_safe_release_tensor(tensors[1]);

    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_CL( scatter_nd_update_reduction, _setup )

@@ -22,6 +22,7 @@
 *
 *****************************************************************************/

#if !(VX_TENSOR_SELECT_VX_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

@@ -359,3 +360,4 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_CL( select, _setup )
#endif

@@ -22,6 +22,7 @@
 *
 *****************************************************************************/

#if !(VX_TENSOR_TILE_API_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

@@ -445,3 +446,4 @@ final:
__END_DECLS

REGISTER_BACKEND_CL( tile, _setup )
#endif

@@ -438,7 +438,7 @@ static vsi_nn_kernel_node_t _setup
    vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }};
    int32_t width = (int32_t)block_size;
    int32_t top_k = vsi_nn_kernel_param_get_int32(params, "top_k");
    int32_t num_stages = (int32_t)ceil(log10(block_size / 2.0f) / log10(2.0f));
    int32_t num_stages = (int32_t)vsi_nn_max(ceil(log10(block_size / 2.0f) / log10(2.0f)), 0);
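    /* Aside: log10(x) / log10(2) is just log2(x), so this computes
     * ceil(log2(block_size / 2)) sort stages; the vsi_nn_max(..., 0) added
     * here clamps the count to zero for block_size < 2, where the log would
     * otherwise go negative. */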
    vsi_bool is_odd_even_sort = FALSE;
    size_t param_num = _TOPK_PARAM_NUM;
    float inputScale = vsi_nn_get_tensor_scale(inputs[0]);

@@ -106,14 +106,12 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer)
    vsi_nn_kernel_dtype_e output_dtype = F16;
    vsi_nn_kernel_tensor_attr_t *input0_attr = NULL, *input1_attr = NULL, *output_attr = NULL;
    vsi_size_array_t *input_shape = NULL;
    float scaleIn = 1.0f;
    int32_t input_ZP = 0;
    float scaleIn1 = 1.0f;
    int32_t input_ZP1 = 0;
    float scaleOut = 1.0f;
    int32_t output_ZP = 0;
    int32_t fixpoint = 0, fixpoint1 = 0, fixpoint_out = 0;
    float inScale_dfp, inScale_dfp1;
    float scaleIn = 1.0f;
    int32_t input_ZP = 0;
    float scaleIn1 = 1.0f;
    int32_t input_ZP1 = 0;
    float scaleOut = 1.0f;
    int32_t output_ZP = 0;
    float eps = 0.0f;
    float rsEps = 0.0f;
    float dimRatio = 0.0f;

@@ -135,80 +133,12 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer)
    rsEps = (float)(1.0f / sqrtf(eps));
    dimRatio = (float)(1.0 / (input_shape->data[0]));


    if ( VSI_NN_KERNEL_QUANT_DFP == input0_attr->quant )
    {
        fixpoint = input0_attr->dfp.fl;
    }
    else if ( VSI_NN_KERNEL_QUANT_ASYMM == input0_attr->quant )
    {
        input_ZP = input0_attr->asymm.zero_point;
        scaleIn = input0_attr->asymm.scale;
    }
    else
    {
        input_ZP = 0;
        scaleIn = 1.0f;
    }

    //input1
    if ( VSI_NN_KERNEL_QUANT_DFP == input1_attr->quant )
    {
        fixpoint1 = input1_attr->dfp.fl;
    }
    else if ( VSI_NN_KERNEL_QUANT_ASYMM == input1_attr->quant )
    {
        input_ZP1 = input1_attr->asymm.zero_point;
        scaleIn1 = input1_attr->asymm.scale;
    }
    else
    {
        input_ZP1 = 0;
        scaleIn1 = 1.0f;
    }

    //output
    if ( VSI_NN_KERNEL_QUANT_DFP == output_attr->quant )
    {
        fixpoint_out = output_attr->dfp.fl;
        if (fixpoint_out >= 0)
        {
            scaleOut = 1.0f / (vx_float32) ((int64_t)1 << fixpoint_out);
        }
        else
        {
            scaleOut = (vx_float32) ((int64_t)1 << -fixpoint_out);
        }
        output_ZP = 0;
    }
    else if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant )
    {
        output_ZP = output_attr->asymm.zero_point;
        scaleOut = output_attr->asymm.scale;
    }
    else
    {
        output_ZP = 0;
        scaleOut = 1.0f;
    }

    if (fixpoint >= 0)
    {
        inScale_dfp = 1.0f / (vx_float32) ((int64_t)1 << fixpoint);
    }
    else
    {
        inScale_dfp = (vx_float32) ((int64_t)1 << -fixpoint);
    }

    if (fixpoint1 >= 0)
    {
        inScale_dfp1 = 1.0f / (vx_float32) ((int64_t)1 << fixpoint1);
    }
    else
    {
        inScale_dfp1 = (vx_float32) ((int64_t)1 << -fixpoint1);
    }
    scaleIn = input0_attr->scale;
    input_ZP = input0_attr->zero_point;
    scaleIn1 = input1_attr->scale;
    input_ZP1 = input1_attr->zero_point;
    scaleOut = output_attr->scale;
    output_ZP = output_attr->zero_point;

    gpu_param.global_offset[0] = 0;
    gpu_param.global_offset[1] = 0;

@@ -349,8 +279,8 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer)
            &uniConvertInt16ScaleToFp32Fst_4x4);
        status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16ScaleToFp32Sec_4x4",
            &uniConvertInt16ScaleToFp32Sec_4x4);
        status |= vsi_nn_kernel_gpu_add_param(node, "inScale_i16", &inScale_dfp);
        status |= vsi_nn_kernel_gpu_add_param(node, "inScale1_i16", &inScale_dfp1);
        status |= vsi_nn_kernel_gpu_add_param(node, "inScale_i16", &scaleIn);
        status |= vsi_nn_kernel_gpu_add_param(node, "inScale1_i16", &scaleIn1);
        CHECK_STATUS_FAIL_GOTO(status, final );
    }
    width = (int32_t)input_shape->data[0];

@@ -215,41 +215,11 @@ DEF_KERNEL_INITIALIZER(_batch_norm_initializer)
    output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output);
    CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );

    if( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        int32_t fl = input_attr->dfp.fl;
        if (fl > 0)
        {
            input_scale = 1.0f / (float) ((int64_t)1 << fl);
        }
        else
        {
            input_scale = (float)((int64_t)1 << -fl);
        }
    }
    else if( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
    {
        input_scale = input_attr->asymm.scale;
        input_tail = 0 - input_scale * (float)input_attr->asymm.zero_point;
    }
    input_scale = input_attr->scale;
    input_tail = 0 - input_scale * (float)input_attr->zero_point;

    if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        int32_t fl = output_attr->dfp.fl;
        if (fl > 0)
        {
            output_scale = (float) ((int64_t)1 << fl);
        }
        else
        {
            output_scale = 1.0f / (float)((int64_t)1 << -fl);
        }
    }
    else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
    {
        output_scale = 1.0f / output_attr->asymm.scale;
        output_zp = (float)output_attr->asymm.zero_point;
    }
    output_scale = 1.0f / output_attr->scale;
    output_zp = (float)output_attr->zero_point;
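    /* The unified attr->scale / attr->zero_point fields used here fold the
     * old DFP and ASYMM branches into one affine mapping:
     *   real_in = input_scale * q_in + input_tail
     *   q_out   = real_out * output_scale + output_zp
     * which is why output_scale is stored as the reciprocal of the tensor
     * scale. */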

    pack_key = _PACK_BATCH_NORM_KEY( input_attr->dtype, output_attr->dtype );

@@ -121,23 +121,20 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
    vsi_nn_kernel_dtype_e output_dtype = F16;

    uint32_t depth = 0;
    float half_input0_wh[2];
    float add_float_value[2];
    uint32_t in0_width;
    uint32_t in0_height;
    uint32_t out_width;
    uint32_t out_height;
    int32_t align_corners;
    float half_input0_wh[2] = {0};
    float add_float_value[2] = {0};
    uint32_t in0_width = 0;
    uint32_t in0_height = 0;
    uint32_t out_width = 0;
    uint32_t out_height = 0;
    int32_t align_corners = 0;

    int32_t src0FixPointPos = 0;
    int32_t src1FixPointPos = 0;
    int32_t dstFixPointPos = 0;
    float input0_scale = 1.0;
    int32_t input0ZP = 0;
    float input1_scale = 1.0;
    int32_t input1ZP = 0;
    float output_scale = 1.0;
    int32_t outputZP = 0;
    float input0_scale = 1.0;
    int32_t input0ZP = 0;
    float input1_scale = 1.0;
    int32_t input1ZP = 0;
    float output_scale = 1.0;
    int32_t outputZP = 0;

    VSI_UNREFERENCED(param_size);

@@ -165,54 +162,14 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
    input1_dtype = input_attr[1]->dtype;
    output_dtype = output_attr->dtype;

    if (U8 == input0_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr[0]->quant) {
        input0_scale = input_attr[0]->asymm.scale;
        input0ZP = input_attr[0]->asymm.zero_point;
    } else if (VSI_NN_KERNEL_QUANT_DFP == input_attr[0]->quant) {
        src0FixPointPos = input_attr[0]->dfp.fl;
        if (src0FixPointPos >= 0) {
            input0_scale = 1.0f / (float)((int64_t)1 << src0FixPointPos);
        } else if (src0FixPointPos < 0) {
            input0_scale = (float)((int64_t)1 << -src0FixPointPos);
        }
        input0ZP = 0;
    } else {
        input0_scale = 1.0f;
        input0ZP = 0;
    }
    input0_scale = input_attr[0]->scale;
    input0ZP = input_attr[0]->zero_point;

    if (U8 == input1_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr[1]->quant) {
        input1_scale = input_attr[1]->asymm.scale;
        input1ZP = input_attr[1]->asymm.zero_point;
    } else if (VSI_NN_KERNEL_QUANT_DFP == input_attr[1]->quant) {
        src1FixPointPos = input_attr[1]->dfp.fl;
        if (src1FixPointPos >= 0) {
            input1_scale = 1.0f / (float)((int64_t)1 << src1FixPointPos);
        } else if (src1FixPointPos < 0) {
            input1_scale = (float)((int64_t)1 << -src1FixPointPos);
        }
        input1ZP = 0;
    } else {
        input1_scale = 1.0f;
        input1ZP = 0;
    }

    if (U8 == output_dtype && VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant) {
        output_scale = output_attr->asymm.scale;
        outputZP = output_attr->asymm.zero_point;
    } else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) {
        dstFixPointPos = output_attr->dfp.fl;
        if (dstFixPointPos >= 0) {
            output_scale = (float)((int64_t)1 << dstFixPointPos);
        } else if (dstFixPointPos < 0) {
            output_scale = 1.0f / (float)((int64_t)1 << -dstFixPointPos);
        }
        outputZP = 0;
    } else {
        output_scale = 1.0;
        outputZP = 0;
    }
    input1_scale = input_attr[1]->scale;
    input1ZP = input_attr[1]->zero_point;

    output_scale = output_attr->scale;
    outputZP = output_attr->zero_point;

    in0_width = (uint32_t)(in0_shape->data[0]);
    in0_height = (uint32_t)(in0_shape->data[1]);

@@ -496,7 +453,7 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
        I16 == output_dtype)) ||
        ((I8 == input0_dtype && I8 == input1_dtype &&
        I8 == output_dtype))) {
        float dfpScale = input0_scale * output_scale;
        float dfpScale = input0_scale / output_scale;
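        /* With the unified attr fields, output_scale now holds the plain
         * tensor scale rather than its DFP reciprocal, so the combined
         * in-to-out factor becomes input0_scale / output_scale instead of
         * a multiply. */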
        gpu_dp_inst_t uniDFPtoFp32_part0_4x4 = {{
            0x01010101, // TCfg
            0x00000000, // ASelt

@@ -179,7 +179,6 @@ DEF_KERNEL_INITIALIZER(_clip_initializer)
        / gpu_param.global_scale[1]);
    gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;


    if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
    {
        srcFixPointPos = input_attr->dfp.fl;

@@ -22,6 +22,7 @@
 *
 *****************************************************************************/

#if !(VX_RELATIONAL_OPS_VX_SUPPORT_EXT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

@@ -319,41 +320,10 @@ DEF_KERNEL_INITIALIZER(_comparisons_initializer)

    out_shape = attr[2]->shape;

    if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        int32_t fl = attr[0]->dfp.fl;
        if (fl > 0)
        {
            input0Scale = 1.0f / (float) ((int64_t)1 << fl);
        }
        else
        {
            input0Scale = (float)((int64_t)1 << -fl);
        }
    }
    else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
    {
        input0Scale = attr[0]->asymm.scale;
        input0Tail = 0 - attr[0]->asymm.zero_point * input0Scale;
    }

    if( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        int32_t fl = attr[1]->dfp.fl;
        if (fl > 0)
        {
            input1Scale = 1.0f / (float) ((int64_t)1 << fl);
        }
        else
        {
            input1Scale = (float)((int64_t)1 << -fl);
        }
    }
    else if( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
    {
        input1Scale = attr[1]->asymm.scale;
        input1Tail = 0 - attr[1]->asymm.zero_point * input1Scale;
    }
    input0Scale = attr[0]->scale;
    input0Tail = 0 - attr[0]->zero_point * input0Scale;
    input1Scale = attr[1]->scale;
    input1Tail = 0 - attr[1]->zero_point * input1Scale;

    gpu_param.global_scale[0] = 8;
    gpu_param.global_scale[1] = 1;

@@ -616,3 +586,4 @@ final:
REGISTER_BACKEND_EVIS( relational_ops, _setup )

__END_DECLS
#endif

@@ -152,23 +152,12 @@ DEF_KERNEL_INITIALIZER(_conv1d_ovxlib_initializer)
    out_shape = output_attr->shape;
    weight_shape = weights_attr->shape;

    if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant )
    {
        input_ZP = input_attr->asymm.zero_point;
        scaleIn = input_attr->asymm.scale;
    }

    if ( VSI_NN_KERNEL_QUANT_ASYMM == weights_attr->quant )
    {
        weight_ZP = weights_attr->asymm.zero_point;
        scaleWights = weights_attr->asymm.scale;
    }

    if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant )
    {
        output_ZP = (float)output_attr->asymm.zero_point;
        scaleOut = output_attr->asymm.scale;
    }
    input_ZP = input_attr->zero_point;
    scaleIn = input_attr->scale;
    weight_ZP = weights_attr->zero_point;
    scaleWights = weights_attr->scale;
    output_ZP = (float)output_attr->zero_point;
    scaleOut = output_attr->scale;

    scaleOut = (scaleIn * scaleWights) / scaleOut;
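    /* scaleOut now holds the usual requantization multiplier
     * M = (S_input * S_weight) / S_output that maps the integer accumulator
     * back into the output quantization domain. */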
    input_height = (int32_t)(in_shape->data[1]);

@@ -0,0 +1,540 @@
/****************************************************************************
 *
 * Copyright (c) 2020 Vivante Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 *****************************************************************************/


#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
#include "utils/vsi_nn_dtype_util.h"

__BEGIN_DECLS

typedef enum _crop_and_resize_type_e
{
    nearest_neighbor = VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR,
    bilinear = VSI_NN_INTERPOLATION_BILINEAR,
}crop_and_resize_type_e;

#define _CROP_AND_RESIZE_KERNEL_SOURCE_NAME "crop_and_resize_"

// Add kernel hashtable here
#define CROP_AND_RESIZE_HASH_KEY( IN_DTYPE, OUT_DTYPE, RESIZE_METHOD ) \
    (( IN_DTYPE << 16 ) | ( OUT_DTYPE << 8) | (RESIZE_METHOD))
#define CROP_AND_RESIZE_KERNEL( IN_DTYPE, OUT_DTYPE, RESIZE_METHOD ) \
    { CROP_AND_RESIZE_HASH_KEY( IN_DTYPE, OUT_DTYPE, RESIZE_METHOD ), \
      CVIVANTE_NAMESPACE("evis.crop_and_resize_"#RESIZE_METHOD"_"#IN_DTYPE"to"#OUT_DTYPE), \
      _CROP_AND_RESIZE_KERNEL_SOURCE_NAME#RESIZE_METHOD }
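/* For reference: instantiating the macro as
 *     CROP_AND_RESIZE_KERNEL( U8, F16, bilinear )
 * yields the hash key (U8 << 16) | (F16 << 8) | bilinear, the kernel name
 * "evis.crop_and_resize_bilinear_U8toF16" (wrapped by CVIVANTE_NAMESPACE),
 * and the source name "crop_and_resize_bilinear". */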


typedef struct
{
    uint32_t key;
    char * function_name;
    const char * source_name;
} _kernel_map_type;

static const _kernel_map_type _crop_and_resize_kernel_map[] =
{
    // Register kernel here
    CROP_AND_RESIZE_KERNEL( U8, U8, nearest_neighbor ),
    CROP_AND_RESIZE_KERNEL( U8, F16, nearest_neighbor ),
    CROP_AND_RESIZE_KERNEL( F16, F16, nearest_neighbor),
    CROP_AND_RESIZE_KERNEL( F16, U8, nearest_neighbor ),
    CROP_AND_RESIZE_KERNEL( F16, I8, nearest_neighbor),
    CROP_AND_RESIZE_KERNEL( I8, I8, nearest_neighbor ),
    CROP_AND_RESIZE_KERNEL( I8, F16, nearest_neighbor),
    CROP_AND_RESIZE_KERNEL( I16, I16, nearest_neighbor ),
    CROP_AND_RESIZE_KERNEL( I16, F16, nearest_neighbor),

    CROP_AND_RESIZE_KERNEL( U8, U8, bilinear),
    CROP_AND_RESIZE_KERNEL( U8, F16, bilinear),
    CROP_AND_RESIZE_KERNEL( F16, F16, bilinear),
    CROP_AND_RESIZE_KERNEL( F16, U8, bilinear),
    CROP_AND_RESIZE_KERNEL( F16, I8, bilinear),
    CROP_AND_RESIZE_KERNEL( I8, I8, bilinear),
    CROP_AND_RESIZE_KERNEL( I8, F16, bilinear),
    CROP_AND_RESIZE_KERNEL( I16, I16, bilinear),
    CROP_AND_RESIZE_KERNEL( I16, F16, bilinear),
};


/*
 * Kernel params
 */
static vx_param_description_t _crop_and_resize_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _CROP_AND_RESIZE_PARAM_NUM _cnt_of_array( _crop_and_resize_kernel_param_def )

/*
 * Kernel initializer
 */
DEF_KERNEL_INITIALIZER(_crop_and_resize_nearest_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VSI_FAILURE;
    gpu_param_t gpu_param = {
        3,
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0},
        };

    vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
    int32_t crop_width = 0;
    int32_t crop_height = 0;
    int32_t image_width = 0;
    int32_t image_height = 0;
    int32_t batch_out = 0;
    float width_scale = 0;
    float height_scale = 0;
    float src0ZP = 0;
    float src0Scale = 1;
    float dstZP = 0;
    float dstScale = 1;
    float inOutScale = 0;
    float inOutTile = 0;

    VSI_UNREFERENCED(param_size);

    attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
    attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );

    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &batch_out);
    CHECK_STATUS_FAIL_GOTO(status, final );

    src0Scale = attr[0]->scale;
    src0ZP = (float)attr[0]->zero_point;

    dstScale = attr[1]->scale;
    dstZP = (float)attr[1]->zero_point;

    inOutScale = src0Scale / dstScale;
    inOutTile = dstZP - inOutScale * src0ZP;
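    /* Standard affine requantization: q_out = (src0Scale * (q_in - src0ZP))
     * / dstScale + dstZP. Precomputing inOutScale = src0Scale / dstScale and
     * inOutTile = dstZP - inOutScale * src0ZP lets the kernel evaluate it as
     * a single fused q_in * inOutScale + inOutTile. */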

    image_width = (int32_t)(attr[0]->shape->data[0]);
    image_height = (int32_t)(attr[0]->shape->data[1]);
    crop_width = (int32_t)(attr[1]->shape->data[0]);
    crop_height = (int32_t)(attr[1]->shape->data[1]);

    width_scale = (crop_width > 1) ? (float)(image_width - 1) / (crop_width -1) : 0;
    height_scale = (crop_height > 1) ? (float)(image_height - 1) / (crop_height -1) : 0;
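    /* Align-corners style sampling: for crops larger than one pixel,
     * (image - 1) / (crop - 1) spaces the samples so the first and last
     * crop pixels land on the ends of the sampled range; presumably the
     * kernel scales this by the normalized box extents from the boxes
     * input. A one-pixel crop degenerates to scale 0. */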

    gpu_param.global_scale[0] = 8;
    gpu_param.global_scale[1] = 1;
    gpu_param.global_scale[2] = 1;

    gpu_param.global_size[0] = gpu_align_p2((crop_width + gpu_param.global_scale[0] - 1)
        / gpu_param.global_scale[0], 8);
    gpu_param.global_size[1] = (crop_height + gpu_param.global_scale[1] - 1)
        / gpu_param.global_scale[1];
    gpu_param.global_size[2] = (batch_out + gpu_param.global_scale[2] - 1)
        / gpu_param.global_scale[2];

    status = vsi_nn_kernel_gpu_config( node, &gpu_param );

    CHECK_STATUS_FAIL_GOTO(status, final);
    {
        gpu_dp_inst_t uniExtract8Bit_2x8 = {{
            0x33333333, // TCfg
            0x11110000, // ASelt
            0x03020100, 0x03020100, // ABin
            0x00000000, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00002400, // AccumType, ConstantType, and PostShift
            0x00000000, 0x00000000, 0x00000000, 0x00000000,
            0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
        }, GPU_DP_TYPE_16};
        gpu_dp_inst_t uniExtractHalf8_2x8 = {{
            0x11111111, // TCfg
            0x11110000, // ASelt
            0x06040200, 0x06040200, // ABin
            0x22222222, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00002100, // AccumType, ConstantType, and PostShift
            0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
            0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
        }, GPU_DP_TYPE_16 };
        gpu_dp_inst_t uniConvertFstToFp32_4x4 = {{
            0x01010101, // TCfg
            0x00000000, // ASelt
            0x00010000, 0x00030002, // ABin
            0x02020202, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000100, // AccumType, ConstantType, and PostShift
            0x00003c00, 0x00000000, 0x00003c00, 0x00000000,
            0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
        }, GPU_DP_TYPE_16};

        gpu_dp_inst_t uniConvertSecToFp32_4x4 = {{
            0x01010101, // TCfg
            0x00000000, // ASelt
            0x00050004, 0x00070006, // ABin
            0x02020202, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000100, // AccumType, ConstantType, and PostShift
            0x00003c00, 0x00000000, 0x00003c00, 0x00000000,
            0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
        }, GPU_DP_TYPE_16};

        status = vsi_nn_kernel_gpu_add_param( node, "uniConvertFstToFp32_4x4", &uniConvertFstToFp32_4x4 );
        status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertSecToFp32_4x4", &uniConvertSecToFp32_4x4 );
        status |= vsi_nn_kernel_gpu_add_param( node, "uniExtract8Bit_2x8", &uniExtract8Bit_2x8 );
        status |= vsi_nn_kernel_gpu_add_param( node, "uniExtractHalf8_2x8", &uniExtractHalf8_2x8 );
        status |= vsi_nn_kernel_gpu_add_param( node, "inOutScale", &inOutScale );
        status |= vsi_nn_kernel_gpu_add_param( node, "inOutTile", &inOutTile );
        status |= vsi_nn_kernel_gpu_add_param( node, "width_scale", &width_scale );
        status |= vsi_nn_kernel_gpu_add_param( node, "height_scale", &height_scale );
        status |= vsi_nn_kernel_gpu_add_param( node, "image_width", &image_width );
        status |= vsi_nn_kernel_gpu_add_param( node, "image_height", &image_height );
        CHECK_STATUS_FAIL_GOTO(status, final);
    }

final:
    if (attr[0])
    {
        vsi_nn_kernel_tensor_attr_release( &attr[0] );
        attr[0] = NULL;
    }
    if (attr[1])
    {
        vsi_nn_kernel_tensor_attr_release( &attr[1] );
        attr[1] = NULL;
    }

    return status;
} /* _crop_and_resize_nearest_initializer() */

DEF_KERNEL_INITIALIZER(_crop_and_resize_bilinear_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VSI_FAILURE;
    gpu_param_t gpu_param = {
        3,
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0},
        };

    vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
    int32_t crop_width = 0;
    int32_t crop_height = 0;
    int32_t image_width = 0;
    int32_t image_height = 0;
    int32_t batch_out = 0;
    float width_scale = 0;
    float height_scale = 0;
    float src0ZP = 0;
    float src0Scale = 1;
    float dstZP = 0;
    float dstScale = 1;
    float inOutScale = 0;
    float inOutTile = 0;

    VSI_UNREFERENCED(param_size);

    attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
    attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );

    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &batch_out);
    CHECK_STATUS_FAIL_GOTO(status, final );

    src0Scale = attr[0]->scale;
    src0ZP = (float)attr[0]->zero_point;

    dstScale = attr[1]->scale;
    dstZP = (float)attr[1]->zero_point;

    inOutScale = src0Scale / dstScale;
    inOutTile = dstZP - inOutScale * src0ZP;

    image_width = (int32_t)(attr[0]->shape->data[0]);
    image_height = (int32_t)(attr[0]->shape->data[1]);
    crop_width = (int32_t)(attr[1]->shape->data[0]);
    crop_height = (int32_t)(attr[1]->shape->data[1]);

    width_scale = (crop_width > 1) ? (float)(image_width - 1) / (crop_width -1) : 0;
    height_scale = (crop_height > 1) ? (float)(image_height - 1) / (crop_height -1) : 0;

    gpu_param.global_scale[0] = 4;
    gpu_param.global_scale[1] = 1;
    gpu_param.global_scale[2] = 1;

    gpu_param.global_size[0] = gpu_align_p2((crop_width + gpu_param.global_scale[0] - 1)
        / gpu_param.global_scale[0], 4);
    gpu_param.global_size[1] = (crop_height + gpu_param.global_scale[1] - 1)
        / gpu_param.global_scale[1];
    gpu_param.global_size[2] = (batch_out + gpu_param.global_scale[2] - 1)
        / gpu_param.global_scale[2];

    status = vsi_nn_kernel_gpu_config( node, &gpu_param );

    CHECK_STATUS_FAIL_GOTO(status, final);
    {
        gpu_dp_inst_t uniExtract8Bit_2x8 = {{
            0x33333333, // TCfg
            0x11110000, // ASelt
            0x03020100, 0x03020100, // ABin
            0x00000000, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00002400, // AccumType, ConstantType, and PostShift
            0x00000000, 0x00000000, 0x00000000, 0x00000000,
            0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
        }, GPU_DP_TYPE_16};
        gpu_dp_inst_t uniExtractHalf8_2x8 = {{
            0x11111111, // TCfg
            0x11110000, // ASelt
            0x06040200, 0x06040200, // ABin
            0x22222222, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00002100, // AccumType, ConstantType, and PostShift
            0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
            0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
        }, GPU_DP_TYPE_16 };
        gpu_dp_inst_t uniRightToFp32_4x4 = {{
            0x01010101, // TCfg
            0x00000000, // ASelt
            0x00030001, 0x00070005, // ABin
            0x02020202, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000400, // AccumType, ConstantType, and PostShift
            0x00000001, 0x00000000, 0x00000001, 0x00000000,
            0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
        }, GPU_DP_TYPE_16 };
        gpu_dp_inst_t uniLeftToFp32_4x4 = {{
            0x01010101, // TCfg
            0x00000000, // ASelt
            0x00020000, 0x00060004, // ABin
            0x02020202, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000400, // AccumType, ConstantType, and PostShift
            0x00000001, 0x00000000, 0x00000001, 0x00000000,
            0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
        }, GPU_DP_TYPE_16 };

        status = vsi_nn_kernel_gpu_add_param( node, "uniRightToFp32_4x4", &uniRightToFp32_4x4 );
        status |= vsi_nn_kernel_gpu_add_param( node, "uniLeftToFp32_4x4", &uniLeftToFp32_4x4 );
        status |= vsi_nn_kernel_gpu_add_param( node, "uniExtract8Bit_2x8", &uniExtract8Bit_2x8 );
        status |= vsi_nn_kernel_gpu_add_param( node, "uniExtractHalf8_2x8", &uniExtractHalf8_2x8 );
        status |= vsi_nn_kernel_gpu_add_param( node, "inOutScale", &inOutScale );
        status |= vsi_nn_kernel_gpu_add_param( node, "inOutTile", &inOutTile );
        status |= vsi_nn_kernel_gpu_add_param( node, "width_scale", &width_scale );
        status |= vsi_nn_kernel_gpu_add_param( node, "height_scale", &height_scale );
        status |= vsi_nn_kernel_gpu_add_param( node, "image_width", &image_width );
        status |= vsi_nn_kernel_gpu_add_param( node, "image_height", &image_height );
        CHECK_STATUS_FAIL_GOTO(status, final);
    }

final:
    if (attr[0])
    {
        vsi_nn_kernel_tensor_attr_release( &attr[0] );
        attr[0] = NULL;
    }
    if (attr[1])
    {
        vsi_nn_kernel_tensor_attr_release( &attr[1] );
        attr[1] = NULL;
    }

    return status;
} /* _crop_and_resize_bilinear_initializer() */

/*
 * Query kernel
 */
static vsi_status _query_kernel
    (
    vsi_nn_kernel_t * kernel,
    vsi_nn_tensor_t * const * const inputs,
    vsi_nn_tensor_t * const * const outputs,
    int32_t resize_method
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_dtype_e in_dtype;
    vsi_nn_kernel_dtype_e out_dtype;
    const _kernel_map_type * kernel_map = _crop_and_resize_kernel_map;
    size_t kernel_map_size = _cnt_of_array( _crop_and_resize_kernel_map );
    vx_param_description_t * param_def = _crop_and_resize_kernel_param_def;
    vx_kernel_initialize_f initializer = _crop_and_resize_nearest_initializer;

    uint32_t key;
    uint32_t i;

    in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    if (resize_method == bilinear)
    {
        initializer = _crop_and_resize_bilinear_initializer;
    }
    key = CROP_AND_RESIZE_HASH_KEY( in_dtype, out_dtype, resize_method );

    for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
    {
        if ( kernel_map[i].key == key )
        {
            break;
        }
    }
    if ( i < (uint32_t)kernel_map_size )
    {
        snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
        kernel->info.parameters = param_def;
        kernel->info.numParams = _cnt_of_array( _crop_and_resize_kernel_param_def );
        kernel->info.initialize = initializer;
        // Register code source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
            "vsi_nn_kernel_header",
            kernel_map[i].source_name );
        // Register binary source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
            kernel_map[i].source_name );
        status = VSI_SUCCESS;
    }
    return status;
} /* _query_kernel() */


static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t ** outputs,
    size_t output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t * kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_CROP_AND_RESIZE_PARAM_NUM];
    vsi_nn_kernel_node_t node = NULL;
    vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL;
    vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{0}};
    uint32_t ori_depth = (uint32_t)inputs[0]->attr.size[2];
    uint32_t ori_batchout = (uint32_t)outputs[0]->attr.size[3];
    float extrapolation_value = vsi_nn_kernel_param_get_float32( params, "extrapolation_value" );
    int32_t resize_method = vsi_nn_kernel_param_get_int32( params, "resize_method" );

    VSI_UNREFERENCED(params);
    VSI_UNREFERENCED(input_num);
    VSI_UNREFERENCED(output_num);

    if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
        outputs[0]->attr.dim_num ) )
    {
        return NULL;
    }

    shapes[0][0] = inputs[0]->attr.size[0];
    shapes[0][1] = inputs[0]->attr.size[1];
    shapes[0][2] = inputs[0]->attr.size[2] * inputs[0]->attr.size[3];

    shapes[1][0] = outputs[0]->attr.size[0];
    shapes[1][1] = outputs[0]->attr.size[1];
    shapes[1][2] = outputs[0]->attr.size[2] * outputs[0]->attr.size[3];
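    /* The 4-D tensors are viewed as 3-D by folding depth and batch into the
     * third dimension; ori_depth / ori_batchout are passed as scalars so the
     * kernel can recover the original split. */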

    rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[0], 3 );
    rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[1], 3 );

    if (rs_input == NULL || rs_output == NULL)
    {
        goto final;
    }

    status = _query_kernel( kernel, inputs, outputs, resize_method );
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if ( node )
        {
            /* Set inputs and outputs */
            node_params[0] = rs_input;
            node_params[1] = (vsi_nn_kernel_node_param_t)(inputs[1]->t);
            node_params[2] = (vsi_nn_kernel_node_param_t)(inputs[2]->t);
            node_params[3] = rs_output;
            node_params[4] = vsi_nn_kernel_scalar_create( graph, I32, &ori_depth );
            node_params[5] = vsi_nn_kernel_scalar_create( graph, I32, &ori_batchout );
            status = vsi_nn_kernel_node_pass_param( node, node_params, _CROP_AND_RESIZE_PARAM_NUM );
            CHECK_STATUS(status);
            vsi_nn_kernel_scalar_release( &node_params[4] );
            vsi_nn_kernel_scalar_release( &node_params[5] );
        }
        {
            // Set default border mode.
            vx_border_t border;
            border.mode = VX_BORDER_CONSTANT;
            vsi_nn_Float32ToDtype(extrapolation_value, (uint8_t*)&border.constant_value.U32, &inputs[0]->attr.dtype);
            status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) );
            CHECK_STATUS(status);
        }
    }
final:
    if (rs_input)
    {
        vsi_nn_kernel_tensor_release( &rs_input );
    }
    if (rs_output)
    {
        vsi_nn_kernel_tensor_release( &rs_output );
    }
    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_EVIS( crop_and_resize, _setup )

@@ -204,39 +204,11 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer)
    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &reverse);
    CHECK_STATUS_FAIL_GOTO(status, OnError );

    if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        if (attr[0]->dfp.fl > 0)
        {
            input_scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
        }
        else
        {
            input_scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
        }
    }
    else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
    {
        input_scale = attr[0]->asymm.scale;
        input_zp = attr[0]->asymm.zero_point;
    }
    input_scale = attr[0]->scale;
    input_zp = attr[0]->zero_point;

    if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        if (attr[1]->dfp.fl > 0)
        {
            output_scale = (float)((int64_t)1 << attr[1]->dfp.fl);
        }
        else
        {
            output_scale = (1.0f / (float)((int64_t)1 << -attr[1]->dfp.fl));
        }
    }
    else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
    {
        output_scale = 1.0f / attr[1]->asymm.scale;
        output_zp = (float)attr[1]->asymm.zero_point;
    }
    output_scale = 1.0f / attr[1]->scale;
    output_zp = (float)attr[1]->zero_point;

    in_out_scale = input_scale * output_scale;
    in_out_zp_scale = (float)in_out_scale * input_zp * (-1);
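    /* For a running sum of n quantized inputs,
     * q_out = in_out_scale * sum(q_in) + n * in_out_zp_scale + output_zp,
     * since each input contributes input_scale * (q - input_zp) and
     * output_scale already holds the reciprocal of the output scale. */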

@@ -22,7 +22,7 @@
 *
 *****************************************************************************/


#if !(VX_DEPTH2SPACE_CRD_MODE_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

@@ -161,51 +161,10 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer)
    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &block_size);
    CHECK_STATUS_FAIL_GOTO(status, OnError );

    if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
    {
        src0ZP = attr[0]->asymm.zero_point;
        src0Scale = attr[0]->asymm.scale;
    }
    else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        if (attr[0]->dfp.fl > 0)
        {
            src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
        }
        else
        {
            src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
        }
        src0ZP = 0;
    }
    else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
    {
        src0Scale = 1;
        src0ZP = 0;
    }

    if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
    {
        dstZP = attr[1]->asymm.zero_point;
        dstScale = attr[1]->asymm.scale;
    }
    else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        if (attr[1]->dfp.fl > 0)
        {
            dstScale = (1.0f / (float)((int64_t)1 << attr[1]->dfp.fl));
        }
        else
        {
            dstScale = (float)((int64_t)1 << -attr[1]->dfp.fl);
        }
        dstZP = 0;
    }
    else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE )
    {
        dstScale = 1;
        dstZP = 0;
    }
    src0ZP = attr[0]->zero_point;
    src0Scale = attr[0]->scale;
    dstZP = attr[1]->zero_point;
    dstScale = attr[1]->scale;

    output_dims = (uint32_t)attr[1]->shape->size;
    output_width = (int32_t)(attr[1]->shape->data[0]);

@@ -454,4 +413,4 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_EVIS( depth2space_internal, _setup )

#endif

@@ -250,12 +250,12 @@ DEF_KERNEL_INITIALIZER(_depthwise_conv1d_initializer)
    gpu_param.global_size[1] = gpu_align_p2((output_shape->data[1] + gpu_param.global_scale[1] - 1)
        / gpu_param.global_scale[1], gpu_param.local_size[1]);

    outputScale = input_attr->asymm.scale;
    outputScale = input_attr->scale;

    outputScale *= weight_attr->asymm.scale;
    weightZP = weight_attr->asymm.zero_point;
    outputScale /= output_attr->asymm.scale;
    outputZP = (float)output_attr->asymm.zero_point + 0.5f;
    outputScale *= weight_attr->scale;
    weightZP = weight_attr->zero_point;
    outputScale /= output_attr->scale;
    outputZP = (float)output_attr->zero_point + 0.5f;
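    /* The extra 0.5f folds round-to-nearest into the zero-point, assuming
     * the kernel truncates when it converts the float result back to the
     * integer output type. */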

#define _PACK_SELECT_KEY( kernel_size, dilation, evis_version ) \
    ((uint64_t)kernel_size | ((uint64_t)dilation << 16) | ((uint64_t)evis_version << 32))

@@ -135,17 +135,10 @@ DEF_KERNEL_INITIALIZER(_detect_post_box_initializer)
    status = vsi_nn_kernel_gpu_add_param( node, "logE", &logE);
    CHECK_STATUS_FAIL_GOTO(status, final );

    if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant )
    {
        input0_ZP = input_attr->asymm.zero_point;
        scaleIn0 = input_attr->asymm.scale;
    }

    if ( VSI_NN_KERNEL_QUANT_ASYMM == input1_attr->quant )
    {
        input1_ZP = input1_attr->asymm.zero_point;
        scaleIn1 = input1_attr->asymm.scale;
    }
    input0_ZP = input_attr->zero_point;
    scaleIn0 = input_attr->scale;
    input1_ZP = input1_attr->zero_point;
    scaleIn1 = input1_attr->scale;

    if ((F32 == input_attr->dtype) || (F32 == input1_attr->dtype))
    {
@ -60,6 +60,7 @@ typedef enum
UNARY_ATANH,
UNARY_ACOSH,
UNARY_INVERSE_SIGMOID,
UNARY_TAN,
} unary_type_e;

/*

@ -108,6 +109,7 @@ typedef enum
#define ATANH_OPERATION atanh
#define ACOSH_OPERATION acosh
#define INVERSE_SIGMOID_OPERATION inverse_sigmoid
#define TAN_OPERATION tan

#define ADD_UNARY_SH_KERNELS(name, source) \
TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, BF16, BF16, source##_3D) \

@ -153,6 +155,7 @@ static const struct {
ADD_UNARY_SH_KERNELS(ATAN, KERNEL_SOURCE1)
ADD_UNARY_SH_KERNELS(ATANH, KERNEL_SOURCE1)
ADD_UNARY_SH_KERNELS(ACOSH, KERNEL_SOURCE1)
ADD_UNARY_SH_KERNELS(TAN, KERNEL_SOURCE1)

ADD_UNARY_SH_KERNELS(HSIGMOID, KERNEL_SOURCE0)
ADD_UNARY_SH_KERNELS(MISH, KERNEL_SOURCE0)

@ -177,6 +180,7 @@ static const struct {
#undef RCP_OPERATION
#undef SIGN_OPERATION
#undef SOFTSIGN_OPERATION
#undef TAN_OPERATION
/*
* Kernel params
*/

@ -243,41 +247,10 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer)
}
out_shape = attr[1]->shape;

if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[0]->dfp.fl;
if (fl > 0)
{
inputScale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
inputScale = (float)((int64_t)1 << -fl);
}
}
else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
inputScale = attr[0]->asymm.scale;
inputTail = 0 - attr[0]->asymm.zero_point * inputScale;
}

if( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[1]->dfp.fl;
if (fl > 0)
{
outputScale = (float)((int64_t)1 << fl);
}
else
{
outputScale = (float)1.0f / (float) ((int64_t)1 << -fl);
}
}
else if( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
outputScale = (float)1.0f / attr[1]->asymm.scale;
outputZP = (float)attr[1]->asymm.zero_point;
}
inputScale = attr[0]->scale;
inputTail = 0 - attr[0]->zero_point * inputScale;
outputScale = (float)1.0f / attr[1]->scale;
outputZP = (float)attr[1]->zero_point;

#define _PACK_SELECT_KEY( TYPE, IN_TYPE, OUT_TYPE ) \
(( TYPE << 24) | ( IN_TYPE << 16) | ( OUT_TYPE << 8))

@ -298,17 +271,23 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer)

switch( pack_key )
{
#if !(VX_ACTIVATION_SIN_COS_VX_SUPPORT_EXT)
case _PACK_SELECT_KEY( UNARY_SIN, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_COS, BF16, BF16 ):
#endif
#if !(VX_ACTIVATION_EXP_VX_SUPPORT_EXT)
case _PACK_SELECT_KEY( UNARY_EXP, BF16, BF16 ):
#endif
case _PACK_SELECT_KEY( UNARY_LOG, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_SELU, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_NEG, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_HSIGMOID, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_MISH, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_ROUND, BF16, BF16 ):
#if !(VX_ACTIVATION_GELU_VX_SUPPORT_EXT)
case _PACK_SELECT_KEY( UNARY_GELU, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_HGELU, BF16, BF16 ):
#endif
case _PACK_SELECT_KEY( UNARY_CELU, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_RCP, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_SIGN, BF16, BF16 ):

@ -317,6 +296,7 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer)
case _PACK_SELECT_KEY( UNARY_ATANH, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_ACOSH, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_INVERSE_SIGMOID, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_TAN, BF16, BF16 ):
{
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg

@ -614,16 +594,22 @@ OnError:
} \
REGISTER_BACKEND_EVIS( KERNEL_NAME, _##KERNEL_NAME##_setup )

#if !(VX_ACTIVATION_SIN_COS_VX_SUPPORT_EXT)
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( sin, UNARY_SIN )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( cos, UNARY_COS )
#endif
#if !(VX_ACTIVATION_EXP_VX_SUPPORT_EXT)
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( exp, UNARY_EXP )
#endif
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( log, UNARY_LOG )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( neg, UNARY_NEG )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( hard_sigmoid, UNARY_HSIGMOID )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( mish, UNARY_MISH )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( round, UNARY_ROUND )
#if !(VX_ACTIVATION_GELU_VX_SUPPORT_EXT)
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( gelu, UNARY_GELU )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( hard_gelu, UNARY_HGELU )
#endif
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( selu, UNARY_SELU )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( celu, UNARY_CELU )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( rcp, UNARY_RCP )

@ -633,5 +619,6 @@ REGISTER_ELTWISE_UNARY_BACKEND_EVIS( atan, UNARY_ATAN )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( atanh, UNARY_ATANH )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( acosh, UNARY_ACOSH )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( inverse_sigmoid, UNARY_INVERSE_SIGMOID )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( tan, UNARY_TAN )

__END_DECLS
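The inputScale/inputTail and outputScale/outputZP pairs computed in the initializer above define the affine transform wrapped around every unary op: dequantize, apply the function, requantize. A sketch of that contract under the same naming; apply_unary_quantized is a hypothetical helper, not part of the kernel source:

#include <stdint.h>

/* x = q_in * inputScale + inputTail, where inputTail = -zp_in * s_in;
 * q_out = f(x) * outputScale + outputZP, where outputScale = 1 / s_out. */
static float apply_unary_quantized(int32_t q_in,
                                   float inputScale, float inputTail,
                                   float outputScale, float outputZP,
                                   float (*f)(float))
{
    float x = (float)q_in * inputScale + inputTail; /* dequantize */
    return f(x) * outputScale + outputZP;           /* requantize */
}

The new UNARY_TAN entries slot into this machinery unchanged; only the op tables and the LUT enum grow.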
@ -145,41 +145,10 @@ DEF_KERNEL_INITIALIZER(_erf_initializer)

out_shape = attr[1]->shape;

if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[0]->dfp.fl;
if (fl > 0)
{
inputScale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
inputScale = (float)((int64_t)1 << -fl);
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
inputScale = attr[0]->asymm.scale;
inputTail = 0 - attr[0]->asymm.zero_point * inputScale;
}

if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[1]->dfp.fl;
if (fl > 0)
{
outputScale = (float)((int64_t)1 << fl);
}
else
{
outputScale = (float)1.0f / (float) ((int64_t)1 << -fl);
}
}
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
outputScale = (float)1.0f / attr[1]->asymm.scale;
outputZP = (float)attr[1]->asymm.zero_point;
}
inputScale = attr[0]->scale;
inputTail = 0 - (float)attr[0]->zero_point * inputScale;
outputScale = (float)1.0f / attr[1]->scale;
outputZP = (float)attr[1]->zero_point;

#define _PACK_SELECT_KEY( IN_TYPE, OUT_TYPE ) \
( ( IN_TYPE << 16) | ( OUT_TYPE << 8))
@ -129,9 +129,6 @@ DEF_KERNEL_INITIALIZER(_floordiv_initializer)
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t *output_shape = NULL;
vsi_nn_kernel_dtype_e input0_dtype = F16;
int32_t input0_fl = 0;
int32_t input1_fl = 0;
int32_t output_fl = 0;
float inScale0 = 1.0f;
float inScale1 = 1.0f;
float outScale = 1.0f;

@ -169,59 +166,12 @@ DEF_KERNEL_INITIALIZER(_floordiv_initializer)
(output_shape->data[2] + gpu_param.global_scale[2] - 1)
/ gpu_param.global_scale[2] : 1;

if( input0_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
input0_fl = input0_attr->dfp.fl;
if (input0_fl > 0)
{
inScale0 = 1.0f / (float) ((int64_t)1 << input0_fl);
}
else
{
inScale0 = (float)((int64_t)1 << -input0_fl);
}
}
else if( input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
inScale0 = input0_attr->asymm.scale;
in0Tail = -inScale0 * ((float)input0_attr->asymm.zero_point);
}

if( input1_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
input1_fl = input1_attr->dfp.fl;
if (input1_fl > 0)
{
inScale1 = 1.0f / (float) ((int64_t)1 << input1_fl);
}
else
{
inScale1 = (float)((int64_t)1 << -input1_fl);
}
}
else if( input1_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
inScale1 = input1_attr->asymm.scale;
in1Tail = -inScale1 * ((float)input1_attr->asymm.zero_point);
}

if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
output_fl = output_attr->dfp.fl;
if (output_fl > 0)
{
outScale = (float) ((int64_t)1 << output_fl);
}
else
{
outScale = 1.0f / (float)((int64_t)1 << -output_fl);
}
}
else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
outScale = 1.0f / output_attr->asymm.scale;
outZp = (float)(output_attr->asymm.zero_point);
}
inScale0 = input0_attr->scale;
in0Tail = 0 - inScale0 * ((float)input0_attr->zero_point);
inScale1 = input1_attr->scale;
in1Tail = 0 - inScale1 * ((float)input1_attr->zero_point);
outScale = 1.0f / output_attr->scale;
outZp = (float)(output_attr->zero_point);

if (BF16 == input0_dtype)
{
@ -22,7 +22,7 @@
*
*****************************************************************************/

#if !(VX_TENSOR_GATHER_API_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

@ -202,6 +202,7 @@ static vx_param_description_t _gather_kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _GATHER_PARAM_NUM _cnt_of_array( _gather_kernel_param_def )

@ -285,6 +286,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
int32_t indices_num = 1;
uint32_t input_dims1 = 0;
int32_t batch = 1;
int32_t is_array = 0;
vx_uint32 i = 0;
vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL};
vsi_size_array_t * input1_shape = NULL;

@ -308,40 +310,13 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &block_num);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &is_array);
CHECK_STATUS_FAIL_GOTO(status, OnError );

if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[0]->dfp.fl > 0)
{
src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
}
else
{
src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
src0Scale = attr[0]->asymm.scale;
src0ZP = attr[0]->asymm.zero_point;
}

if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[2]->dfp.fl > 0)
{
dstScale = (float)((int64_t)1 << attr[2]->dfp.fl);
}
else
{
dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl));
}
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
dstScale = 1.0f / attr[2]->asymm.scale;
dstZP = attr[2]->asymm.zero_point;
}
src0Scale = attr[0]->scale;
src0ZP = attr[0]->zero_point;
dstScale = 1.0f / attr[2]->scale;
dstZP = attr[2]->zero_point;

input1_shape = attr[1]->shape;
input_dims1 = (uint32_t)input1_shape->size;

@ -358,8 +333,16 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
}
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
shaderParam.global_size[0] = gpu_align_p2((block_size + shaderParam.global_scale[0] - 1)
/ shaderParam.global_scale[0], 4);
if (is_array)
{
shaderParam.global_size[0] = (block_size + shaderParam.global_scale[0] - 1)
/ shaderParam.global_scale[0];
}
else
{
shaderParam.global_size[0] = gpu_align_p2((block_size + shaderParam.global_scale[0] - 1)
/ shaderParam.global_scale[0], 4);
}
shaderParam.global_size[1] = indices_num;
shaderParam.global_size[2] = block_num;

@ -508,39 +491,10 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer)
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &block_num);
CHECK_STATUS_FAIL_GOTO(status, OnError );

if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[0]->dfp.fl > 0)
{
src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
}
else
{
src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
src0Scale = attr[0]->asymm.scale;
src0ZP = attr[0]->asymm.zero_point;
}

if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[2]->dfp.fl > 0)
{
dstScale = (float)((int64_t)1 << attr[2]->dfp.fl);
}
else
{
dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl));
}
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
dstScale = 1.0f / attr[2]->asymm.scale;
dstZP = attr[2]->asymm.zero_point;
}
src0Scale = attr[0]->scale;
src0ZP = attr[0]->zero_point;
dstScale = 1.0f / attr[2]->scale;
dstZP = attr[2]->zero_point;

input1_shape = attr[1]->shape;
input_dims1 = (uint32_t)input1_shape->size;

@ -661,8 +615,11 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer)
{
status |= vsi_nn_kernel_gpu_add_param(node, "batch", &batch);
}
status |= vsi_nn_kernel_gpu_add_param(node, "width", &width);
status |= vsi_nn_kernel_gpu_add_param(node, "remainder", &remainder);
if (indices_num > GPU_TENSOR_MAX_WIDTH || block_num > GPU_TENSOR_MAX_WIDTH)
{
status |= vsi_nn_kernel_gpu_add_param(node, "width", &width);
status |= vsi_nn_kernel_gpu_add_param(node, "remainder", &remainder);
}
CHECK_STATUS_FAIL_GOTO(status, OnError );

OnError:

@ -841,6 +798,7 @@ static vsi_nn_kernel_node_t _setup
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_num );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_num );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &is_array );
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _GATHER_PARAM_NUM );
vsi_nn_kernel_scalar_release( &tmp_params[3] );
vsi_nn_kernel_scalar_release( &tmp_params[4] );

@ -859,3 +817,4 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_EVIS( gather, _setup )
#endif
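The new is_array flag above selects between an exact X work size and one padded up to a multiple of 4 for vectorized dispatch, padding being safe only when over-dispatch past block_size is harmless. A sketch of the two paths, assuming gpu_align_p2 rounds up to a power-of-two multiple as its call sites suggest; align_up_p2 and gather_global_size_x are illustrative names:

#include <stddef.h>

/* Round n up to a multiple of align, where align is a power of two. */
static size_t align_up_p2(size_t n, size_t align)
{
    return (n + align - 1) & ~(align - 1);
}

/* is_array: exact item count; otherwise: padded for vectorized dispatch. */
static size_t gather_global_size_x(size_t block_size, size_t scale, int is_array)
{
    size_t items = (block_size + scale - 1) / scale;
    return is_array ? items : align_up_p2(items, 4);
}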
@ -290,39 +290,10 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer)
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &block_size);
CHECK_STATUS_FAIL_GOTO(status, OnError );

if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[0]->dfp.fl > 0)
{
src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
}
else
{
src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
src0Scale = attr[0]->asymm.scale;
src0ZP = attr[0]->asymm.zero_point;
}

if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[2]->dfp.fl > 0)
{
dstScale = (float)((int64_t)1 << attr[2]->dfp.fl);
}
else
{
dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl));
}
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
dstScale = 1.0f / attr[2]->asymm.scale;
dstZP = attr[2]->asymm.zero_point;
}
src0Scale = attr[0]->scale;
src0ZP = attr[0]->zero_point;
dstScale = 1.0f / attr[2]->scale;
dstZP = attr[2]->zero_point;

indices_num = (int32_t)(attr[1]->shape->data[1]);
batch_num = (int32_t)(attr[1]->shape->size > 2 ? attr[1]->shape->data[2] : 1);
@ -238,7 +238,7 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_initializer)
float tensorZP[4] = {0.0f, 0.0f, 0.0f, 0.0f};
uint32_t i = 0;
uint32_t pack_key = 0;
vsi_size_array_t * output_shape = NULL;
vsi_nn_kernel_tensor_attr_t * attr[4] = { NULL, NULL, NULL, NULL };

VSI_UNREFERENCED(param_size);

@ -254,12 +254,8 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_initializer)

for (i = 0; i < 4; i++)
{
if( attr[i]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[i]->quant == VSI_NN_KERNEL_QUANT_SYMM)
{
tensorZP[i] = (float)attr[i]->asymm.zero_point;
tensorScale[i] = attr[i]->asymm.scale;
}
tensorZP[i] = (float)attr[i]->zero_point;
tensorScale[i] = attr[i]->scale;
}

tensorZP[0] = tensorScale[0] * tensorZP[0];

@ -459,63 +455,31 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_cdnn_initializer)

output_shape = attr[3]->shape;

if( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM )
{
input_scale = attr[0]->asymm.scale;
input_tail = 0 - input_scale * (float)attr[0]->asymm.zero_point;
}
input_scale = attr[0]->scale;
input_tail = 0 - input_scale * (float)attr[0]->zero_point;

if( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM )
{
input_r_scale = attr[1]->asymm.scale;
input_r_tail = 0 - input_r_scale * (float)attr[1]->asymm.zero_point;
}
input_r_scale = attr[1]->scale;
input_r_tail = 0 - input_r_scale * (float)attr[1]->zero_point;

if( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM )
{
recur_r_scale = attr[2]->asymm.scale;
recur_r_tail = 0 - recur_r_scale * (float)attr[2]->asymm.zero_point;
}
recur_r_scale = attr[2]->scale;
recur_r_tail = 0 - recur_r_scale * (float)attr[2]->zero_point;

if( attr[3]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[3]->quant == VSI_NN_KERNEL_QUANT_SYMM )
{
output_scale = 1.0f / attr[3]->asymm.scale;
output_zp = (float)attr[3]->asymm.zero_point;
}
output_scale = 1.0f / attr[3]->scale;
output_zp = (float)attr[3]->zero_point;

if ( param_size == _GRUCELL_CDNN_SEP_ACTIVATION_PARAM_NUM )
{
if( attr[4]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[4]->quant == VSI_NN_KERNEL_QUANT_SYMM )
{
input_z_scale = attr[4]->asymm.scale;
input_z_tail = 0 - input_z_scale * (float)attr[4]->asymm.zero_point;
}
input_z_scale = attr[4]->scale;
input_z_tail = 0 - input_z_scale * (float)attr[4]->zero_point;

if( attr[5]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[5]->quant == VSI_NN_KERNEL_QUANT_SYMM )
{
recur_z_scale = attr[5]->asymm.scale;
recur_z_tail = 0 - recur_z_scale * (float)attr[5]->asymm.zero_point;
}
recur_z_scale = attr[5]->scale;
recur_z_tail = 0 - recur_z_scale * (float)attr[5]->zero_point;

if( attr[6]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[6]->quant == VSI_NN_KERNEL_QUANT_SYMM )
{
input_c_scale = attr[6]->asymm.scale;
input_c_tail = 0 - input_c_scale * (float)attr[6]->asymm.zero_point;
}
input_c_scale = attr[6]->scale;
input_c_tail = 0 - input_c_scale * (float)attr[6]->zero_point;

if( attr[5]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[5]->quant == VSI_NN_KERNEL_QUANT_SYMM )
{
recur_c_scale = attr[7]->asymm.scale;
recur_c_tail = 0 - recur_c_scale * (float)attr[7]->asymm.zero_point;
}
recur_c_scale = attr[7]->scale;
recur_c_tail = 0 - recur_c_scale * (float)attr[7]->zero_point;
}

if (layer_out == 1 || layer_out == 2)
@ -119,6 +119,8 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_z_h_initializer)
float hstate_in_tail = 0;
float output_scale = 1.0f;
float output_zp = 0;
float output_scale1 = 1.0f;
float output_zp1 = 0;
uint32_t i = 0;
uint32_t pack_key = 0;
vsi_nn_kernel_tensor_attr_t* input_attr[GRUCELL_ACT_Z_H_IN_CNT] = {NULL};

@ -142,33 +144,14 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_z_h_initializer)
output_attr[1] = vsi_nn_kernel_tensor_attr_create( hstate_out );
CHECK_PTR_FAIL_GOTO( output_attr[1], "Create tensor attr buffer fail.", final );

if ( VSI_NN_KERNEL_QUANT_DFP == input_attr[0]->quant )
{
int8_t srcFixPointPos = (int8_t)input_attr[0]->dfp.fl;
if (srcFixPointPos >= 0)
hstate_in_scale *= 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos);
else if (srcFixPointPos < 0)
hstate_in_scale *= (vx_float32)((int64_t)1 << -srcFixPointPos);
}
else if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr[0]->quant )
{
hstate_in_scale = input_attr[0]->asymm.scale;
hstate_in_tail = -(float)input_attr[0]->asymm.zero_point * hstate_in_scale;
}
hstate_in_scale = input_attr[0]->scale;
hstate_in_tail = -(float)input_attr[0]->zero_point * hstate_in_scale;

if ( VSI_NN_KERNEL_QUANT_DFP == output_attr[0]->quant )
{
int8_t dstFixPointPos = (int8_t)output_attr[0]->dfp.fl;
if (dstFixPointPos >= 0)
output_scale *= (vx_float32)((int64_t)1 << dstFixPointPos);
else if (dstFixPointPos < 0)
output_scale *= 1.0f / (vx_float32) ((int64_t)1 << - dstFixPointPos);
}
else if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr[0]->quant )
{
output_scale = 1.0f / output_attr[0]->asymm.scale;
output_zp = (float)output_attr[0]->asymm.zero_point;
}
output_scale = 1.0f / output_attr[0]->scale;
output_zp = (float)output_attr[0]->zero_point;

output_scale1 = 1.0f / output_attr[1]->scale;
output_zp1 = (float)output_attr[1]->zero_point;

pack_key = _PACK_SELECT_KEY( input_attr[0]->dtype, input_attr[1]->dtype, output_attr[0]->dtype);

@ -290,6 +273,8 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_z_h_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "hstate_in_tail", &hstate_in_tail);
status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale);
status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "output_scale1", &output_scale1);
status |= vsi_nn_kernel_gpu_add_param(node, "output_zp1", &output_zp1);
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
@ -132,19 +132,8 @@ DEF_KERNEL_INITIALIZER(_grucell_h_times_activation_r_initializer)
output_attr[0] = vsi_nn_kernel_tensor_attr_create( output );
CHECK_PTR_FAIL_GOTO( output_attr[0], "Create tensor attr buffer fail.", final );

if ( VSI_NN_KERNEL_QUANT_DFP == input_attr[0]->quant )
{
int8_t srcFixPointPos = (int8_t)input_attr[0]->dfp.fl;
if (srcFixPointPos >= 0)
hstate_in_scale *= 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos);
else if (srcFixPointPos < 0)
hstate_in_scale *= (vx_float32)((int64_t)1 << -srcFixPointPos);
}
else if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr[0]->quant )
{
hstate_in_scale = input_attr[0]->asymm.scale;
hstate_in_tail = -(float)input_attr[0]->asymm.zero_point * hstate_in_scale;
}
hstate_in_scale = input_attr[0]->scale;
hstate_in_tail = 0 - (float)input_attr[0]->zero_point * hstate_in_scale;

pack_key = _PACK_SELECT_KEY( input_attr[0]->dtype, input_attr[1]->dtype, output_attr[0]->dtype);
@ -47,6 +47,7 @@ typedef enum _grucell_nn_activation_type_e
SIGMOID = VSI_NN_ACT_SIGMOID,
HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID,
TANH = VSI_NN_ACT_TANH,
RELU = VSI_NN_ACT_RELU,
}grucell_nn_activation_type_e;

#define _GRUCELL_RESET_AFTER_ACTIVATION_KERNEL_SOURCE "grucell_reset_after_activation"

@ -80,6 +81,11 @@ static const _kernel_map_type _grucell_reset_after_activation_kernel_map[] =
PACK_KERNEL_MAP( I16, F16, I16, SIGMOID, SIGMOID ),
PACK_KERNEL_MAP( F16, F16, F16, SIGMOID, SIGMOID ),
PACK_KERNEL_MAP( BF16, BF16, BF16, SIGMOID, SIGMOID ),
PACK_KERNEL_MAP( U8, F16, U8, SIGMOID, RELU ),
PACK_KERNEL_MAP( I8, F16, I8, SIGMOID, RELU ),
PACK_KERNEL_MAP( I16, F16, I16, SIGMOID, RELU ),
PACK_KERNEL_MAP( F16, F16, F16, SIGMOID, RELU ),
PACK_KERNEL_MAP( BF16, BF16, BF16, SIGMOID, RELU ),
};

@ -148,33 +154,11 @@ DEF_KERNEL_INITIALIZER(_grucell_reset_after_activation_initializer)
output_attr[1] = vsi_nn_kernel_tensor_attr_create( hstate_out );
CHECK_PTR_FAIL_GOTO( output_attr[1], "Create tensor attr buffer fail.", final );

if ( VSI_NN_KERNEL_QUANT_DFP == input_attr[0]->quant )
{
int8_t srcFixPointPos = (int8_t)input_attr[0]->dfp.fl;
if (srcFixPointPos >= 0)
hstate_in_scale *= 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos);
else if (srcFixPointPos < 0)
hstate_in_scale *= (vx_float32)((int64_t)1 << -srcFixPointPos);
}
else if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr[0]->quant )
{
hstate_in_scale = input_attr[0]->asymm.scale;
hstate_in_tail = -(float)input_attr[0]->asymm.zero_point * hstate_in_scale;
}
hstate_in_scale = input_attr[0]->scale;
hstate_in_tail = -(float)input_attr[0]->zero_point * hstate_in_scale;

if ( VSI_NN_KERNEL_QUANT_DFP == output_attr[0]->quant )
{
int8_t srcFixPointPos = (int8_t)input_attr[0]->dfp.fl;
if (srcFixPointPos >= 0)
output_scale *= (vx_float32)((int64_t)1 << srcFixPointPos);
else if (srcFixPointPos < 0)
output_scale *= 1.0f / (vx_float32) ((int64_t)1 << -srcFixPointPos);
}
else if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr[0]->quant )
{
output_scale = 1.0f / output_attr[0]->asymm.scale;
output_zp = (float)output_attr[0]->asymm.zero_point;
}
output_scale = 1.0f / output_attr[0]->scale;
output_zp = (float)output_attr[0]->zero_point;

pack_key = _PACK_SELECT_KEY( input_attr[0]->dtype, input_attr[1]->dtype, output_attr[0]->dtype);
@ -127,10 +127,8 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer)
vsi_size_array_t * output_shape = NULL;
vsi_nn_kernel_dtype_e input_dtype = F16;
vsi_nn_kernel_dtype_e output_dtype = F16;
int32_t input_fl = 0;
int32_t inputZP = 0;
float inputScale = 1.0f;
int32_t output_fl = 0;
int32_t outputZP = 0;
float outputScale = 1.0f;
float r_inputScale = 1.0f;

@ -153,41 +151,11 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer)
input_dtype = input_attr->dtype;
output_dtype = output_attr->dtype;

if ( VSI_NN_KERNEL_QUANT_DFP == input_attr->quant )
{
input_fl = input_attr->dfp.fl;
if (input_fl >= 0)
{
inputScale = 1.0f / (float) ((int64_t)1 << input_fl);
}
else
{
inputScale = (float) ((int64_t)1 << -input_fl);
}
}
else if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant )
{
inputZP = input_attr->asymm.zero_point;
inputScale = input_attr->asymm.scale;
}
inputZP = input_attr->zero_point;
inputScale = input_attr->scale;

if ( VSI_NN_KERNEL_QUANT_DFP == output_attr->quant )
{
output_fl = output_attr->dfp.fl;
if (output_fl >= 0)
{
outputScale = (float) ((int64_t)1 << output_fl);
}
else
{
outputScale = 1.0f / (float) ((int64_t)1 << -output_fl);
}
}
else if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant )
{
outputZP = output_attr->asymm.zero_point;
outputScale = 1.0f / output_attr->asymm.scale;
}
outputZP = output_attr->zero_point;
outputScale = 1.0f / output_attr->scale;

e2InScale = inputScale * inputScale;
r_inputScale = 1.0f / inputScale;
@ -22,6 +22,7 @@
*
*****************************************************************************/

#if !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

@ -42,7 +43,11 @@ __BEGIN_DECLS
#define SOURCE_AXIS0_1 "layer_normalization_1"
#define SOURCE_AXIS0_2 "layer_normalization_2"
#define SOURCE_AXIS0_3 "layer_normalization_3"
#define SOURCE_AXIS01 "layer_normalization_axis01"
#define SOURCE_AXIS01_SUM "layer_normalization_axis01_sum"
#define SOURCE_AXIS01_0 "layer_normalization_axis01_0"
#define SOURCE_AXIS01_1 "layer_normalization_axis01_1"
#define SOURCE_AXIS01_2 "layer_normalization_axis01_2"
#define SOURCE_AXIS01_3 "layer_normalization_axis01_3"

#define HASH_LAYERNORM_SH_KERNEL_NAME(SRC0_TYPE, SCALE_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("evis.layer_norm_axis0_"#SRC0_TYPE"_"#SCALE_TYPE"to"#DST_TYPE)

@ -88,15 +93,15 @@ __BEGIN_DECLS
#define HASH_LN_AXIS01_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("evis.layernorm_axis01_"#SRC0_TYPE"_"#SRC1_TYPE"to"#DST_TYPE)

#define LN_AXIS01_SUMS_KERNELS(IN0_TYPE, OUT_TYPE) \
#define LN_AXIS01_SUMS_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_LAYERNORM_KEY(IN0_TYPE, U4, OUT_TYPE, 0), \
HASH_LN_AXIS01_SUMS_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \
SOURCE_AXIS01 },
SOURCE },

#define LAYERNORM_AXIS01_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE) \
#define LAYERNORM_AXIS01_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \
{ HASH_LAYERNORM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \
HASH_LN_AXIS01_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \
SOURCE_AXIS01 },
SOURCE },

typedef struct
{

@ -159,32 +164,32 @@ static const _kernel_map_type _layernorm_kernel_map[] =
static const _kernel_map_type _layernorm_axis01_kernel_map[] =
{
// Register kernel here
LN_AXIS01_SUMS_KERNELS( I8, F32 )
LN_AXIS01_SUMS_KERNELS( U8, F32 )
LN_AXIS01_SUMS_KERNELS( F16, F32 )
LN_AXIS01_SUMS_KERNELS( I16, F32 )
LN_AXIS01_SUMS_KERNELS( I8, F32, SOURCE_AXIS01_SUM )
LN_AXIS01_SUMS_KERNELS( U8, F32, SOURCE_AXIS01_SUM )
LN_AXIS01_SUMS_KERNELS( F16, F32, SOURCE_AXIS01_SUM )
LN_AXIS01_SUMS_KERNELS( I16, F32, SOURCE_AXIS01_SUM )

LAYERNORM_AXIS01_KERNELS( U8, F16, U8 )
LAYERNORM_AXIS01_KERNELS( U8, F16, F16 )
LAYERNORM_AXIS01_KERNELS( I8, F16, I8 )
LAYERNORM_AXIS01_KERNELS( I8, F16, F16 )
LAYERNORM_AXIS01_KERNELS( F16, F16, F16 )
LAYERNORM_AXIS01_KERNELS( F16, F16, I16 )
LAYERNORM_AXIS01_KERNELS( F16, F16, I8 )
LAYERNORM_AXIS01_KERNELS( F16, F16, U8 )
LAYERNORM_AXIS01_KERNELS( I16, F16, I16 )
LAYERNORM_AXIS01_KERNELS( I16, F16, F16 )
LAYERNORM_AXIS01_KERNELS( U8, F16, U8, SOURCE_AXIS01_0 )
LAYERNORM_AXIS01_KERNELS( U8, F16, F16, SOURCE_AXIS01_0 )
LAYERNORM_AXIS01_KERNELS( I8, F16, I8, SOURCE_AXIS01_1 )
LAYERNORM_AXIS01_KERNELS( I8, F16, F16, SOURCE_AXIS01_1 )
LAYERNORM_AXIS01_KERNELS( F16, F16, F16, SOURCE_AXIS01_2 )
LAYERNORM_AXIS01_KERNELS( F16, F16, I16, SOURCE_AXIS01_2 )
LAYERNORM_AXIS01_KERNELS( F16, F16, I8, SOURCE_AXIS01_2 )
LAYERNORM_AXIS01_KERNELS( F16, F16, U8, SOURCE_AXIS01_2 )
LAYERNORM_AXIS01_KERNELS( I16, F16, I16, SOURCE_AXIS01_3 )
LAYERNORM_AXIS01_KERNELS( I16, F16, F16, SOURCE_AXIS01_3 )

LAYERNORM_AXIS01_KERNELS( U8, F32, U8 )
LAYERNORM_AXIS01_KERNELS( U8, F32, F16 )
LAYERNORM_AXIS01_KERNELS( I8, F32, I8 )
LAYERNORM_AXIS01_KERNELS( I8, F32, F16 )
LAYERNORM_AXIS01_KERNELS( F16, F32, F16 )
LAYERNORM_AXIS01_KERNELS( F16, F32, I16 )
LAYERNORM_AXIS01_KERNELS( F16, F32, I8 )
LAYERNORM_AXIS01_KERNELS( F16, F32, U8 )
LAYERNORM_AXIS01_KERNELS( I16, F32, I16 )
LAYERNORM_AXIS01_KERNELS( I16, F32, F16 )
LAYERNORM_AXIS01_KERNELS( U8, F32, U8, SOURCE_AXIS01_0 )
LAYERNORM_AXIS01_KERNELS( U8, F32, F16, SOURCE_AXIS01_0 )
LAYERNORM_AXIS01_KERNELS( I8, F32, I8, SOURCE_AXIS01_1 )
LAYERNORM_AXIS01_KERNELS( I8, F32, F16, SOURCE_AXIS01_1 )
LAYERNORM_AXIS01_KERNELS( F16, F32, F16, SOURCE_AXIS01_2 )
LAYERNORM_AXIS01_KERNELS( F16, F32, I16, SOURCE_AXIS01_2 )
LAYERNORM_AXIS01_KERNELS( F16, F32, I8, SOURCE_AXIS01_2 )
LAYERNORM_AXIS01_KERNELS( F16, F32, U8, SOURCE_AXIS01_2 )
LAYERNORM_AXIS01_KERNELS( I16, F32, I16, SOURCE_AXIS01_3 )
LAYERNORM_AXIS01_KERNELS( I16, F32, F16, SOURCE_AXIS01_3 )

};

@ -1165,3 +1170,4 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_EVIS( layer_norm, _setup )
#endif
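The layer_norm hunks split the single axis01 shader source into a sums source plus per-dtype normalization sources, so each kernel-map entry now names its source explicitly. For orientation, a plain C reference of the two-pass computation that split corresponds to; this is a sketch of standard layer normalization, not the EVIS shader itself:

#include <math.h>
#include <stddef.h>

/* Pass 1 accumulates sum and sum of squares; pass 2 normalizes:
 * y[i] = (x[i] - mean) * rsqrt(var + eps) * gamma[i] + beta[i]. */
static void layer_norm_ref(const float *x, const float *gamma,
                           const float *beta, float *y,
                           size_t n, float eps)
{
    float sum = 0.0f, sqr = 0.0f;
    size_t i;
    for (i = 0; i < n; i++) { sum += x[i]; sqr += x[i] * x[i]; }
    {
        float mean = sum / (float)n;
        float var = sqr / (float)n - mean * mean;
        float rstd = 1.0f / sqrtf(var + eps);
        for (i = 0; i < n; i++)
        {
            y[i] = (x[i] - mean) * rstd * gamma[i] + beta[i];
        }
    }
}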
@ -22,6 +22,7 @@
*
*****************************************************************************/

#if !(VX_LOGSOFTMAX_VX_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

@ -34,15 +35,21 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"

__BEGIN_DECLS

#define HASH_LOG_SOFTMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \
((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d))

#define HASH_LOG_SOFTMAX_KERNEL_SOURCE_NAME(_suffix) \
"log_softmax_axis"#_suffix

#define HASH_LOG_SOFTMAX_EXCEED_KERNEL_SOURCE_NAME(_suffix) \
"log_softmax_exceed_axis"#_suffix

#define HASH_LOG_SOFTMAX_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, _suffix) \
{ HASH_LOG_SOFTMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \
CVIVANTE_NAMESPACE("evis.log_softmax_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE), \

@ -53,11 +60,18 @@ __BEGIN_DECLS
CVIVANTE_NAMESPACE("evis.log_softmax_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE"_2D"), \
HASH_LOG_SOFTMAX_KERNEL_SOURCE_NAME(_suffix) },

static const struct {
#define HASH_LOG_SOFTMAX_EXCEED_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, _suffix) \
{ HASH_LOG_SOFTMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \
CVIVANTE_NAMESPACE("evis.log_softmax_exceed_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE), \
HASH_LOG_SOFTMAX_EXCEED_KERNEL_SOURCE_NAME(_suffix) },

typedef struct {
uint32_t key;
char* function_name;
const char* source_name;
} _log_softmax_evis_kernel_map[] =
} _kernel_map_type;

static const _kernel_map_type _log_softmax_evis_kernel_map[] =
{
HASH_LOG_SOFTMAX_KERNELS(0, F16, F16, 0)
HASH_LOG_SOFTMAX_KERNELS(0, F16, I16, 0)

@ -126,6 +140,49 @@ static const struct {

};

static const _kernel_map_type _log_softmax_exceed_evis_kernel_map[] =
{
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, F16, F16, 0)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, F16, I16, 0)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, F16, U8, 0)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, F16, I8, 0)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, I16, I16, 0)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, I16, F16, 0)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, BF16, BF16, 0_BF16)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, BF16, F32, 0_BF16)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, BF16, F16, 0_BF16)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, U8, U8, 0)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, U8, F16, 0)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, I8, I8, 0)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, I8, F16, 0)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, F16, F16, 1)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, F16, I16, 1)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, F16, U8, 1)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, F16, I8, 1)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, I16, I16, 1)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, I16, F16, 1)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, BF16, BF16, 1_BF16)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, BF16, F32, 1_BF16)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, BF16, F16, 1_BF16)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, U8, U8, 1)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, U8, F16, 1)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, I8, I8, 1)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, I8, F16, 1)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, F16, F16, 2)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, F16, I16, 2)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, F16, U8, 2)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, F16, I8, 2)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, I16, I16, 2)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, I16, F16, 2)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, BF16, BF16, 2)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, U8, U8, 2)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, U8, F16, 2)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, I8, I8, 2)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, I8, F16, 2)
};

static vx_param_description_t kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},

@ -133,7 +190,9 @@ static vx_param_description_t kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};

#define _EVIS_PARAM_NUM _cnt_of_array(kernel_param_def)

#define SCALAR_INPUT_AXIS (2)
#define SCALAR_INPUT_BETA (3)
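The kernel tables above are keyed by HASH_LOG_SOFTMAX_HASH_KEY, which packs the axis, the two dtypes, and the 2D flag into one integer so kernel selection is a single linear scan. A minimal sketch of that lookup pattern with the same field widths; lookup_kernel is a hypothetical stand-in for the _query_kernel loops that follow:

#include <stdint.h>
#include <stddef.h>

typedef struct { uint32_t key; const char *function_name; } entry_t;

/* Same layout as HASH_LOG_SOFTMAX_HASH_KEY: axis<<20 | in<<12 | out<<4 | 2d. */
static const char *lookup_kernel(const entry_t *map, size_t n,
                                 uint32_t axis, uint32_t in_dtype,
                                 uint32_t out_dtype, uint32_t image_2d)
{
    uint32_t key = (axis << 20) | (in_dtype << 12) | (out_dtype << 4) | image_2d;
    size_t i;
    for (i = 0; i < n; i++)
    {
        if (map[i].key == key) return map[i].function_name;
    }
    return NULL; /* no shader variant registered for this combination */
}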
@ -157,7 +216,7 @@ DEF_KERNEL_INITIALIZER(_log_softmax_initializer)
float beta = 0;
float input_scale = 0;
float output_scale = 0;
int32_t outputZP = 0;
float outputZP = 0;
uint32_t inputWidth = 0;
uint32_t inputWidthRemain4 = 0;
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL, NULL };

@ -385,62 +444,25 @@ DEF_KERNEL_INITIALIZER(_log_softmax_initializer)
}
}

outputZP = (float)attr[1]->zero_point;
output_scale = 1.0f / (float)(attr[1]->scale);

if( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[1]->dfp.fl;

if (fl > 0)
{
output_scale = (float)((int64_t)1 << fl);
}
else
{
output_scale = (float)1.0f / (float) ((int64_t)1 << -fl);
}

status = vsi_nn_kernel_gpu_add_param( node,
"outputScale", &output_scale );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
float output_offset_asymmetric = 0;
outputZP = attr[1]->asymm.zero_point;
output_scale = 1.0f / (float)(attr[1]->asymm.scale);
output_offset_asymmetric = (float)outputZP;

status = vsi_nn_kernel_gpu_add_param( node,
"outputScale", &output_scale );
status |= vsi_nn_kernel_gpu_add_param( node,
"output_offset_asymmetric", &output_offset_asymmetric );
"output_offset_asymmetric", &outputZP );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
{
output_scale = 1;
outputZP = 0;
}

if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[0]->dfp.fl;
if (fl > 0)
{
input_scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
input_scale = (float)((int64_t)1 << -fl);
}
}
else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
input_scale = attr[0]->asymm.scale;
}
else
{
input_scale = 1.0f;
}
input_scale = attr[0]->scale;

scaleLogE = scaleLogE * input_scale;
beta = beta * input_scale;
@ -471,6 +493,296 @@ final:
return status;
} /* _log_softmax_initializer() */

DEF_KERNEL_INITIALIZER(_log_softmax_exceed_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
int32_t axis = 0;
float beta = 0;
float input_scale = 0;
float output_scale = 0;
float outputZP = 0;
uint32_t inputWidth = 0;
uint32_t inputWidthRemain4 = 0;
int32_t width = 0;
int32_t height = 0;
int32_t depth = 0;
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL, NULL };
vsi_size_array_t * output_shape = NULL;
float logE = (float)(log10(exp(1.0f)) / log10(2.0f));
float rlogE = (float)(log10(2.0f) / log10(exp(1.0f)));
float scaleLogE = 0;

VSI_UNREFERENCED(param_size);

attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );

status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis);
CHECK_STATUS_FAIL_GOTO(status, final );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[3], &beta);
CHECK_STATUS_FAIL_GOTO(status, final );

scaleLogE = logE * beta;

output_shape = attr[1]->shape;
width = (int32_t)output_shape->data[0];
height = (int32_t)output_shape->data[1];
depth = output_shape->size > 2 ? (int32_t)output_shape->data[2] : 1;
gpu_param.dim = 2;
switch (axis)
{
case 0:
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_size[0] = 1;
gpu_param.global_size[1] = depth;
break;
case 1:
gpu_param.global_scale[0] = 8;
gpu_param.global_scale[1] = 1;
gpu_param.global_size[0] =
gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = 1;
break;
default:
break;
}

{
gpu_dp_inst_t uniGetSubData0to3_4x4 = {{
0x09090909, // TCfg
0x04040404, // ASelt
0x00010000, 0x00030002, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };

gpu_dp_inst_t uniGetSubData4to7_4x4 = {{
0x09090909, // TCfg
0x04040404, // ASelt
0x00050004, 0x00070006, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackMaxData_2x8 = {{
0x00000111, // TCfg
0x00000000, // ASelt
0x00050300, 0x00000000, // ABin
0x00000222, // BSelt
0x00000000, 0x00000000, // BBin
0x00004400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x05050404, 0x07070606, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractHalf4_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00020000, 0x00060004, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniGetSubLoData_4x4 = {{
0x09090909, // TCfg
0x04040404, // ASelt
0x00110000, 0x00330022, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniGetSubHiData_4x4 = {{
0x09090909, // TCfg
0x04040404, // ASelt
0x00550044, 0x00770066, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractHalf8_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };

switch( axis )
{
case 0:
{
inputWidth = (uint32_t)(output_shape->data[axis] / 4 * 4);
inputWidthRemain4 = (uint32_t)(output_shape->data[axis] % 4);

status = vsi_nn_kernel_gpu_add_param( node,
"inputWidth", &inputWidth );
status |= vsi_nn_kernel_gpu_add_param( node,
"inputWidthRemain4", &inputWidthRemain4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniPackMaxData_2x8", &uniPackMaxData_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node, "axisSize", &width );
status |= vsi_nn_kernel_gpu_add_param( node, "height", &height);
if (attr[0]->dtype == BF16)
{
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtractHalf4_4x4", &uniExtractHalf4_4x4 );
}
else
{
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniGetSubData0to3_4x4", &uniGetSubData0to3_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniGetSubData4to7_4x4", &uniGetSubData4to7_4x4 );
}
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
case 1:
{
if (attr[0]->dtype == BF16)
{
status = vsi_nn_kernel_gpu_add_param( node,
"uniExtractHalf8_2x8", &uniExtractHalf8_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtractOddData_2x8", &uniExtractOddData_2x8 );
}
else
{
status = vsi_nn_kernel_gpu_add_param( node,
"uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniGetSubLoData_4x4", &uniGetSubLoData_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniGetSubHiData_4x4", &uniGetSubHiData_4x4 );
}
status |= vsi_nn_kernel_gpu_add_param( node, "axisSize", &height );
status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth);
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
default:
break;
}
}

outputZP = (float)attr[1]->zero_point;
output_scale = 1.0f / attr[1]->scale;

if (attr[0]->dtype != BF16)
{
status = vsi_nn_kernel_gpu_add_param( node,
"outputScale", &output_scale );
status |= vsi_nn_kernel_gpu_add_param( node,
"output_offset_asymmetric", &outputZP );
CHECK_STATUS_FAIL_GOTO(status, final );
}

input_scale = attr[0]->scale;

scaleLogE = scaleLogE * input_scale;
beta = beta * input_scale;

status |= vsi_nn_kernel_gpu_add_param( node,
"rlogE", &rlogE );
status |= vsi_nn_kernel_gpu_add_param( node,
"betaValue", &beta );
status |= vsi_nn_kernel_gpu_add_param( node,
"scaleLogE", &scaleLogE );

status |= vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, final );

final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
}

if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
}

return status;
}
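A note on the constants the exceed initializer sets up: logE is log2(e) and rlogE is ln(2), which lets the shader evaluate exp and log through the hardware-friendly base-2 pair exp2/log2; the kernel additionally folds beta and the input scale into scaleLogE because it reads raw quantized values. A plain C reference of the same identity on already-dequantized data; log_softmax_ref is illustrative only, not the shader code:

#include <math.h>

/* log_softmax(x)_i = beta*x_i - ln(sum_j exp(beta*x_j)), evaluated via
 * exp(t) = exp2(t * log2(e)) and ln(s) = log2(s) * ln(2). */
static float log_softmax_ref(const float *x, int n, int i, float beta)
{
    float scaleLogE = (float)(1.0 / log(2.0)) * beta; /* log2(e) * beta */
    float rlogE = (float)log(2.0);                    /* ln(2) */
    float sum = 0.0f;
    int j;
    for (j = 0; j < n; j++)
    {
        sum += exp2f(x[j] * scaleLogE); /* == exp(beta * x[j]) */
    }
    return beta * x[i] - log2f(sum) * rlogE; /* == beta*x_i - ln(sum) */
}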

static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@ -513,7 +825,51 @@ static vsi_status _query_kernel
return status;
} /* _query_kernel() */

static vsi_nn_kernel_node_t _setup
static vsi_status _query_kernel_exceed
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
int32_t axis,
vsi_nn_kernel_t* kernel
)
{
vsi_nn_kernel_dtype_e input_dtype;
vsi_nn_kernel_dtype_e output_dtype;
vsi_status status = VSI_FAILURE;
uint32_t key;
size_t i;

input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

key = HASH_LOG_SOFTMAX_HASH_KEY( axis, input_dtype, output_dtype, 0 );

for( i = 0; i < _cnt_of_array(_log_softmax_exceed_evis_kernel_map); i ++ )
{
if( _log_softmax_exceed_evis_kernel_map[i].key == key )
{
break;
}
}

if( i < _cnt_of_array(_log_softmax_exceed_evis_kernel_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _log_softmax_exceed_evis_kernel_map[i].function_name );
kernel->info.parameters = kernel_param_def;
kernel->info.numParams = _cnt_of_array( kernel_param_def );
kernel->info.initialize = _log_softmax_exceed_initializer;
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
_log_softmax_exceed_evis_kernel_map[i].source_name );
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
_log_softmax_exceed_evis_kernel_map[i].source_name );
status = VSI_SUCCESS;
}

return status;
}

static vsi_nn_kernel_node_t _setup_not_exceed
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
@ -528,7 +884,13 @@ static vsi_nn_kernel_node_t _setup
|
|||
vsi_nn_kernel_node_param_t node_params[_EVIS_PARAM_NUM] = {NULL};
|
||||
vsi_bool image_2d = FALSE;
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
|
||||
vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } };
|
||||
uint32_t rank_in = 0;
|
||||
int32_t axis = 0;
|
||||
int32_t new_axis = 0;
|
||||
vsi_bool ret = vx_false_e;
|
||||
uint32_t i = 0;
|
||||
float beta = 1.0f;
|
||||
|
||||
VSI_UNREFERENCED(input_num);
|
||||
|
|
@ -537,15 +899,31 @@ static vsi_nn_kernel_node_t _setup
|
|||
axis = vsi_nn_kernel_param_get_int32(params, "axis");
|
||||
beta = vsi_nn_kernel_param_get_float32(params, "beta");
|
||||
|
||||
if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size,
|
||||
inputs[0]->attr.dim_num )
|
||||
|| axis > 2)
|
||||
ret = vsi_nn_kernel_optimize_softmax_shape(
|
||||
inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
|
||||
shapes[0], &rank_in, &new_axis);
|
||||
|
||||
if (ret)
|
||||
{
|
||||
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
|
||||
inputs[0], shapes[0], rank_in );
|
||||
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
|
||||
outputs[0], shapes[0], rank_in );
|
||||
}
|
||||
else
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1);
|
||||
status = _query_kernel( inputs, outputs, axis, image_2d, kernel );
|
||||
if( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[0]->attr.size,
|
||||
reshape_tensors[0]->attr.dim_num )
|
||||
|| new_axis > 2)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
image_2d = (reshape_tensors[0]->attr.dim_num == 2 || reshape_tensors[0]->attr.size[2] == 1);
|
||||
status = _query_kernel( inputs, outputs, new_axis, image_2d, kernel );
|
||||
if( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
|
|
@ -553,9 +931,9 @@ static vsi_nn_kernel_node_t _setup
        {
            /* Pass parameters to node. */
            vsi_nn_kernel_node_pack_io( node_params, _EVIS_PARAM_NUM,
                    inputs, 1, outputs, 1 );
                    reshape_tensors, 1, &reshape_tensors[1], 1 );
            node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create(
                    graph, I32, &axis );
                    graph, I32, &new_axis );
            node_params[SCALAR_INPUT_BETA] = vsi_nn_kernel_scalar_create(
                    graph, F32, &beta );
@ -565,10 +943,132 @@ static vsi_nn_kernel_node_t _setup
            vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_BETA] );
        }
    }

    for (i = 0; i < 2; i++)
    {
        vsi_safe_release_tensor(reshape_tensors[i]);
    }

    return node;
} /* _setup_not_exceed() */
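The not-exceed path above leans on vsi_nn_kernel_optimize_softmax_shape to fold the input layout before the axis guard runs. A minimal sketch of that folding, assuming it simply merges the dimensions below and above the softmax axis (the helper below is illustrative, not the ovxlib implementation):

    /* Sketch only: fold dims around the softmax axis so the axis lands on
     * dimension 1. Assumes plain row-major merging; the real
     * vsi_nn_kernel_optimize_softmax_shape may apply further rules. */
    static void fold_softmax_shape_sketch( const vsi_size_t * shape, uint32_t rank,
        int32_t axis, vsi_size_t out_shape[3], int32_t * out_axis )
    {
        vsi_size_t inner = 1, outer = 1;
        uint32_t i;
        for ( i = 0; i < (uint32_t)axis; i++ )        inner *= shape[i]; /* dims below axis */
        for ( i = (uint32_t)axis + 1; i < rank; i++ ) outer *= shape[i]; /* dims above axis */
        out_shape[0] = inner;
        out_shape[1] = shape[axis];
        out_shape[2] = outer;
        *out_axis = 1;
    }

For example, shape {8, 5, 4, 2} with axis = 2 folds to {40, 4, 2} with new_axis = 1, which satisfies the `new_axis > 2` guard above.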
static vsi_nn_kernel_node_t _setup_exceed
    (
    vsi_nn_graph_t  * graph,
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t ** outputs,
    size_t output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t * kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_EVIS_PARAM_NUM] = {NULL};
    vsi_nn_kernel_node_t node = NULL;

    vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
    vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } };
    uint32_t rank_in = 0;
    int32_t axis = 0;
    int32_t new_axis = 0;
    vsi_bool ret = vx_false_e;
    uint32_t i = 0;
    float beta = 1.0f;

    VSI_UNREFERENCED(input_num);
    VSI_UNREFERENCED(output_num);

    axis = vsi_nn_kernel_param_get_int32(params, "axis");
    beta = vsi_nn_kernel_param_get_float32(params, "beta");

    ret = vsi_nn_kernel_optimize_softmax_shape(
            inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
            shapes[0], &rank_in, &new_axis);

    if (ret)
    {
        reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
                inputs[0], shapes[0], rank_in );
        reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
                outputs[0], shapes[0], rank_in );
    }
    else
    {
        return NULL;
    }

    if( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[0]->attr.size,
                reshape_tensors[0]->attr.dim_num )
        || new_axis > 1)
    {
        goto final; /* release the reshape tensors created above instead of leaking them */
    }

    status = _query_kernel_exceed(inputs, outputs, new_axis, kernel);
    if( VSI_SUCCESS != status)
    {
        goto final;
    }

    node = vsi_nn_kernel_create_node( graph, kernel );
    CHECK_PTR_FAIL_GOTO( node, "Create kernel fail.", final );
    if (node)
    {
        vsi_nn_kernel_node_pack_io(node_params, _EVIS_PARAM_NUM,
                reshape_tensors,
                input_num,
                &reshape_tensors[1],
                output_num);
        node_params[2] = vsi_nn_kernel_scalar_create(graph, I32, &new_axis );
        node_params[3] = vsi_nn_kernel_scalar_create(graph, F32, &beta );

        status = vsi_nn_kernel_node_pass_param(
                node, node_params, _EVIS_PARAM_NUM);
        CHECK_STATUS(status);
        vsi_nn_kernel_scalar_release( &node_params[2] );
        vsi_nn_kernel_scalar_release( &node_params[3] );
    }

final:
    for (i = 0; i < 2; i++)
    {
        vsi_safe_release_tensor(reshape_tensors[i]);
    }

    return node;
}
static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t  * graph,
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t ** outputs,
    size_t output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t * kernel
    )
{
    vsi_nn_kernel_node_t node = NULL;
    vsi_size_t *input_size = inputs[0]->attr.size;
    int32_t axis = 0;
    axis = vsi_nn_kernel_param_get_int32(params, "axis");

    if (input_size[axis] >= GPU_TENSOR_MAX_WIDTH)
    {
        node = _setup_exceed(graph, inputs, input_num, outputs, output_num, params, kernel);
    }
    else
    {
        node = _setup_not_exceed(graph, inputs, input_num, outputs, output_num, params, kernel);
    }

    return node;
}

__END_DECLS

REGISTER_BACKEND_EVIS( log_softmax, _setup )

#endif
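All of these EVIS backends select a shader through the same idiom: pack (axis, dtypes, flags) into an integer key and scan a static kernel map for it. A reduced sketch of the idiom; the key layout and entry names below are hypothetical, since each kernel defines its own HASH_*_KEY macro:

    #include <stdint.h>
    #include <stddef.h>

    typedef struct {
        uint32_t     key;
        const char * function_name;  /* shader entry point */
        const char * source_name;    /* .vx source file    */
    } kernel_map_entry_t;

    /* Illustrative entries and packing only. */
    static const kernel_map_entry_t map_sketch[] = {
        { (1u << 16) | (0x01u << 8) | 0x01u, "log_softmax_axis1_F16toF16", "log_softmax_exceed" },
    };

    static const char* select_kernel_sketch( uint32_t axis, uint32_t in_dt, uint32_t out_dt )
    {
        uint32_t key = (axis << 16) | (in_dt << 8) | out_dt; /* hypothetical packing */
        size_t i;
        for ( i = 0; i < sizeof(map_sketch) / sizeof(map_sketch[0]); i++ )
        {
            if ( map_sketch[i].key == key ) return map_sketch[i].function_name;
        }
        return NULL; /* combination unsupported: _query_kernel returns VSI_FAILURE */
    }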
@ -996,18 +996,14 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer)
    float forget_bias = 0.0f;
    float outputScale = 1.0f;
    float outputZP = 0;
    int32_t dstZP = 0;
    float dstScale = 1.0f;
    vsi_nn_kernel_dtype_e cellFormat = F16;
    vsi_nn_kernel_dtype_e dstFormat = F16;
    vsi_nn_kernel_quant_type_e dstQuantType = VSI_NN_KERNEL_QUANT_NONE;
    int32_t dstFixPointPos = 0;
    float logE = (vx_float32)(log10(exp(1.0f)) / log10(2.0f));
    float logE = (float)(log10(exp(1.0f)) / log10(2.0f));
    float twoLogE = 2 * logE;
    uint32_t uint_min = 0xFBFFFFFF;
    uint32_t uint_max = 0x7BFFFFFF;
    float float_min = *(vx_float32 *)&uint_min;
    float float_max = *(vx_float32 *)&uint_max;
    float float_min = *(float *)&uint_min;
    float float_max = *(float *)&uint_max;
    float clip_Min_F[4] = {0};
    float clip_Max_F[4] = {0};
    uint32_t i = 0;
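Two details above are worth spelling out. `log10(exp(1.0f)) / log10(2.0f)` is just log2(e) ≈ 1.4427, the factor that turns the hardware's base-2 exponential into a natural exp. And `*(float *)&uint_max` type-puns an integer bit pattern (0x7BFFFFFF ≈ 2.6e36, 0xFBFFFFFF its negative) into a large finite clamp bound; under strict aliasing the well-defined spelling of the same pun is memcpy:

    #include <string.h>
    #include <stdint.h>

    /* Equivalent, strict-aliasing-safe form of the pointer cast above. */
    uint32_t uint_max = 0x7BFFFFFF;
    float float_max;
    memcpy( &float_max, &uint_max, sizeof(float_max) ); /* approx 2.66e36f */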
@ -1063,22 +1059,11 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer)
    status = vsi_nn_kernel_scalar_read_float32( (vsi_nn_kernel_scalar_t)param[param_size - 1], &forget_bias );
    CHECK_STATUS_FAIL_GOTO(status, final );

    cellFormat = attr[0]->dtype;
    dstFormat = attr[1]->dtype;
    cellFormat = attr[0]->dtype;
    dstFormat  = attr[1]->dtype;

    dstQuantType = attr[1]->quant;

    if ( VSI_NN_KERNEL_QUANT_DFP == dstQuantType )
    {
        dstFixPointPos = (int8_t)attr[1]->dfp.fl;
    }
    else if ( VSI_NN_KERNEL_QUANT_ASYMM == dstQuantType )
    {
        dstZP = attr[1]->asymm.zero_point;
        dstScale = attr[1]->asymm.scale;
    }

    outputZP = (vx_float32)dstZP;
    outputScale = 1.0f / attr[1]->scale;
    outputZP = (float)attr[1]->zero_point;

    gpu_param.global_scale[0] = 4;
    gpu_param.global_scale[1] = 1;
@ -1182,20 +1167,6 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer)
        0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
    }, GPU_DP_TYPE_16};

    if (dstQuantType == VSI_NN_KERNEL_QUANT_DFP)
    {
        if (dstFixPointPos >= 0)
            outputScale *= (vx_float32)((int64_t)1 << dstFixPointPos);
        else if (dstFixPointPos < 0)
            outputScale *= 1.0f / (vx_float32) ((int64_t)1 << -dstFixPointPos);

        outputZP = 0;
    }
    else if (dstQuantType == VSI_NN_KERNEL_QUANT_ASYMM)
    {
        outputScale = 1.0f / dstScale;
    }

    if ( cellFormat == F16 )
    {
        status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractHalf4_4x4", &uniExtractHalf4_4x4);
@ -288,67 +288,13 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer)
    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &K);
    CHECK_STATUS_FAIL_GOTO(status, OnError );

    src0ZP = attr[0]->asymm.zero_point;
    src0Scale = attr[0]->asymm.scale;
    src1ZP = attr[1]->asymm.zero_point;
    src1Scale = attr[1]->asymm.scale;
    dstZP = (float)attr[2]->asymm.zero_point;
    dstScale = attr[2]->asymm.scale;
    src0ZP = attr[0]->zero_point;
    src0Scale = attr[0]->scale;
    src1ZP = attr[1]->zero_point;
    src1Scale = attr[1]->scale;
    dstZP = (float)attr[2]->zero_point;
    dstScale = attr[2]->scale;

    if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        if (attr[0]->dfp.fl > 0)
        {
            src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
        }
        else
        {
            src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
        }
        src0ZP = 0;
    }
    else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE)
    {
        src0Scale = 1;
        src0ZP = 0;
    }

    if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        if (attr[1]->dfp.fl > 0)
        {
            src1Scale = (1.0f / ((float) ((int64_t)1 << attr[1]->dfp.fl)));
        }
        else
        {
            src1Scale = ((float) ((int64_t)1 << -attr[1]->dfp.fl));
        }
        src1ZP = 0;
    }
    else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE)
    {
        src1Scale = 1;
        src1ZP = 0;
    }

    if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        if (attr[2]->dfp.fl > 0)
        {
            dstScale = (float)((int64_t)1 << attr[2]->dfp.fl);
        }
        else
        {
            dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl));
        }
        dstScale = 1.0f / dstScale;
        dstZP = 0.0f;
    }
    else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE )
    {
        dstScale = 1;
        dstZP = 0.0f;
    }
    gpu_quantize_multiplier_16bit(src0Scale / 1.0f, &M0, &postShift0);
    gpu_quantize_multiplier_16bit(src1Scale / 1.0f, &M1, &postShift1);
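gpu_quantize_multiplier_16bit turns a floating-point rescale factor into an integer multiplier plus a post-shift so the EVIS dot-product path can stay in fixed point. A hedged sketch of that conversion; the rounding and range handling of the real helper are not shown in this diff:

    #include <stdint.h>

    /* scale ~= M / 2^post_shift, with M kept in [2^14, 2^15) for precision.
     * Sketch only; not the ovxlib implementation. */
    static void quantize_multiplier_16bit_sketch( float scale, uint16_t * M, int32_t * post_shift )
    {
        int32_t shift = 0;
        while ( scale < (float)(1 << 14) && shift < 31 ) { scale *= 2.0f; shift++; }
        while ( scale >= (float)(1 << 15) )              { scale *= 0.5f; shift--; }
        *M = (uint16_t)( scale + 0.5f );
        *post_shift = shift;
    }

For src0Scale = 0.02f this yields M0 = 20972 and postShift0 = 20, since (x * 20972) >> 20 ≈ x * 0.02.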
@ -1266,67 +1212,12 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_cross_initializer)
    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &axis_size);
    CHECK_STATUS_FAIL_GOTO(status, OnError );

    src0ZP = attr[0]->asymm.zero_point;
    src0Scale = attr[0]->asymm.scale;
    src1ZP = attr[1]->asymm.zero_point;
    src1Scale = attr[1]->asymm.scale;
    dstZP = (float)attr[2]->asymm.zero_point;
    dstScale = attr[2]->asymm.scale;

    if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        if (attr[0]->dfp.fl > 0)
        {
            src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
        }
        else
        {
            src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
        }
        src0ZP = 0;
    }
    else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE)
    {
        src0Scale = 1;
        src0ZP = 0;
    }

    if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        if (attr[1]->dfp.fl > 0)
        {
            src1Scale = (1.0f / ((float) ((int64_t)1 << attr[1]->dfp.fl)));
        }
        else
        {
            src1Scale = ((float) ((int64_t)1 << -attr[1]->dfp.fl));
        }
        src1ZP = 0;
    }
    else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE)
    {
        src1Scale = 1;
        src1ZP = 0;
    }

    if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        if (attr[2]->dfp.fl > 0)
        {
            dstScale = (float)((int64_t)1 << attr[2]->dfp.fl);
        }
        else
        {
            dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl));
        }
        dstScale = 1.0f / dstScale;
        dstZP = 0.0f;
    }
    else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE )
    {
        dstScale = 1;
        dstZP = 0.0f;
    }
    src0ZP = attr[0]->zero_point;
    src0Scale = attr[0]->scale;
    src1ZP = attr[1]->zero_point;
    src1Scale = attr[1]->scale;
    dstZP = (float)attr[2]->zero_point;
    dstScale = attr[2]->scale;

    mulKIn0In1Zp = (float)((int)(K + 3) / 4 * 4 * src1ZP * src0ZP);
    inOutScale = src0Scale * src1Scale / dstScale;
@ -163,63 +163,12 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer)
    CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );

    out_shape = attr[2]->shape;

    if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        int32_t fl = attr[0]->dfp.fl;
        if (fl > 0)
        {
            input0_scale = 1.0f / (float) ((int64_t)1 << fl);
        }
        else
        {
            input0_scale = (float)((int64_t)1 << -fl);
        }
    }
    else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM
           || attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM)
    {
        input0_zp = attr[0]->asymm.zero_point;
        input0_scale = attr[0]->asymm.scale;
    }

    if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        int32_t fl = attr[1]->dfp.fl;
        if (fl > 0)
        {
            input1_scale = 1.0f / (float) ((int64_t)1 << fl);
        }
        else
        {
            input1_scale = (float)((int64_t)1 << -fl);
        }
    }
    else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM
           || attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM)
    {
        input1_zp = attr[1]->asymm.zero_point;
        input1_scale = attr[1]->asymm.scale;
    }

    if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        int32_t fl = attr[2]->dfp.fl;
        if (fl > 0)
        {
            output_scale = (float) ((int64_t)1 << fl);
        }
        else
        {
            output_scale = 1.0f / (float)((int64_t)1 << -fl);
        }
    }
    else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM
           || attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM)
    {
        output_zp = attr[2]->asymm.zero_point;
        output_scale = 1.0f / attr[2]->asymm.scale;
    }
    input0_zp = attr[0]->zero_point;
    input0_scale = attr[0]->scale;
    input1_zp = attr[1]->zero_point;
    input1_scale = attr[1]->scale;
    output_zp = attr[2]->zero_point;
    output_scale = 1.0f / attr[2]->scale;

#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \
        (IN0_TYPE | (IN1_TYPE << 8) | ( OUT_TYPE << 16))
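The branches deleted above encode the equivalence this whole refactor relies on: ovxlib's attr->scale / attr->zero_point now carry the affine form of every quantization scheme, with DFP's fractional length fl pre-folded as scale = 2^-fl and zero point 0. In code, matching the removed per-branch arithmetic exactly:

    #include <stdint.h>

    /* DFP fractional length as an affine scale (fl may be negative). */
    static float dfp_as_scale( int32_t fl )
    {
        return ( fl >= 0 ) ? 1.0f / (float)( (int64_t)1 << fl )
                           : (float)( (int64_t)1 << -fl );
    }

    /* Real value of a quantized sample under any scheme:
     *   real = scale * ( (float)q - (float)zero_point );   */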
@ -454,30 +403,52 @@ static vsi_nn_kernel_node_t _setup
    vsi_nn_tensor_t* tmp_inputs[2] = { NULL };
    vsi_nn_type_e dtype1 = inputs[0]->attr.dtype.vx_type;
    vsi_nn_type_e dtype2 = inputs[1]->attr.dtype.vx_type;
    vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
    vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } };
    vsi_size_t new_rank = 0;
    vsi_bool ret = TRUE;

    VSI_UNREFERENCED(input_num);
    VSI_UNREFERENCED(output_num);
    VSI_UNREFERENCED(params);

    if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
                outputs[0]->attr.dim_num ) )
    ret = vsi_nn_kernel_optimize_eltwise_shape(
            inputs[0]->attr.size, inputs[0]->attr.dim_num,
            inputs[1]->attr.size, inputs[1]->attr.dim_num,
            outputs[0]->attr.size, outputs[0]->attr.dim_num,
            shapes[0], shapes[1], shapes[2], &new_rank );

    if (ret == FALSE)
    {
        return NULL;
        goto final;
    }

    reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
            inputs[0], shapes[0], new_rank );
    reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
            inputs[1], shapes[1], new_rank );
    reshape_tensors[2] = vsi_nn_reshape_tensor( graph,
            outputs[0], shapes[2], new_rank );

    if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[2]->attr.size,
                reshape_tensors[2]->attr.dim_num ) )
    {
        goto final;
    }

    // Reorder tensor
    if ( dtype1 != dtype2 && dtype1 == VSI_NN_TYPE_FLOAT16 )
    {
        int32_t order[2] = {1, 0};
        vsi_nn_reorder_tensor( inputs, order, 2, tmp_inputs );
        vsi_nn_reorder_tensor( reshape_tensors, order, 2, tmp_inputs );
    }
    else
    {
        memmove( tmp_inputs, inputs, sizeof(vsi_nn_tensor_t*) * 2 );
        memmove( tmp_inputs, reshape_tensors, sizeof(vsi_nn_tensor_t*) * 2 );
    }

    image_2d = (outputs[0]->attr.dim_num == 2);
    status = _query_kernel( tmp_inputs, outputs, image_2d, kernel );
    image_2d = (reshape_tensors[2]->attr.dim_num == 2);
    status = _query_kernel( tmp_inputs, &reshape_tensors[2], image_2d, kernel );
    if ( VSI_SUCCESS == status )
    {
        node = vsi_nn_kernel_create_node( graph, kernel );

@ -485,10 +456,16 @@ static vsi_nn_kernel_node_t _setup
        {
            /* Pass parameters to node. */
            vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PARAM_NUM,
                    tmp_inputs, 2, outputs, 1 );
                    tmp_inputs, 2, &reshape_tensors[2], 1 );
            status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PARAM_NUM );
        }
    }

final:
    vsi_safe_release_tensor(reshape_tensors[0]);
    vsi_safe_release_tensor(reshape_tensors[1]);
    vsi_safe_release_tensor(reshape_tensors[2]);

    return node;
} /* _setup() */
@ -163,63 +163,12 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer)
    CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );

    out_shape = attr[2]->shape;

    if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        int32_t fl = attr[0]->dfp.fl;
        if (fl > 0)
        {
            input0_scale = 1.0f / (float) ((int64_t)1 << fl);
        }
        else
        {
            input0_scale = (float)((int64_t)1 << -fl);
        }
    }
    else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM
           || attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM)
    {
        input0_zp = attr[0]->asymm.zero_point;
        input0_scale = attr[0]->asymm.scale;
    }

    if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        int32_t fl = attr[1]->dfp.fl;
        if (fl > 0)
        {
            input1_scale = 1.0f / (float) ((int64_t)1 << fl);
        }
        else
        {
            input1_scale = (float)((int64_t)1 << -fl);
        }
    }
    else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM
           || attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM)
    {
        input1_zp = attr[1]->asymm.zero_point;
        input1_scale = attr[1]->asymm.scale;
    }

    if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        int32_t fl = attr[2]->dfp.fl;
        if (fl > 0)
        {
            output_scale = (float) ((int64_t)1 << fl);
        }
        else
        {
            output_scale = 1.0f / (float)((int64_t)1 << -fl);
        }
    }
    else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM
           || attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM)
    {
        output_zp = attr[2]->asymm.zero_point;
        output_scale = 1.0f / attr[2]->asymm.scale;
    }
    input0_zp = attr[0]->zero_point;
    input0_scale = attr[0]->scale;
    input1_zp = attr[1]->zero_point;
    input1_scale = attr[1]->scale;
    output_zp = attr[2]->zero_point;
    output_scale = 1.0f / attr[2]->scale;

#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \
        (IN0_TYPE | (IN1_TYPE << 8) | ( OUT_TYPE << 16))
@ -454,30 +403,52 @@ static vsi_nn_kernel_node_t _setup
    vsi_nn_tensor_t* tmp_inputs[2] = { NULL };
    vsi_nn_type_e dtype1 = inputs[0]->attr.dtype.vx_type;
    vsi_nn_type_e dtype2 = inputs[1]->attr.dtype.vx_type;
    vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
    vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } };
    vsi_size_t new_rank = 0;
    vsi_bool ret = TRUE;

    VSI_UNREFERENCED(input_num);
    VSI_UNREFERENCED(output_num);
    VSI_UNREFERENCED(params);

    if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
                outputs[0]->attr.dim_num ) )
    ret = vsi_nn_kernel_optimize_eltwise_shape(
            inputs[0]->attr.size, inputs[0]->attr.dim_num,
            inputs[1]->attr.size, inputs[1]->attr.dim_num,
            outputs[0]->attr.size, outputs[0]->attr.dim_num,
            shapes[0], shapes[1], shapes[2], &new_rank );

    if (ret == FALSE)
    {
        return NULL;
        goto final;
    }

    reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
            inputs[0], shapes[0], new_rank );
    reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
            inputs[1], shapes[1], new_rank );
    reshape_tensors[2] = vsi_nn_reshape_tensor( graph,
            outputs[0], shapes[2], new_rank );

    if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[2]->attr.size,
                reshape_tensors[2]->attr.dim_num ) )
    {
        goto final;
    }

    // Reorder tensor
    if ( dtype1 != dtype2 && dtype1 == VSI_NN_TYPE_FLOAT16 )
    {
        int32_t order[2] = {1, 0};
        vsi_nn_reorder_tensor( inputs, order, 2, tmp_inputs );
        vsi_nn_reorder_tensor( reshape_tensors, order, 2, tmp_inputs );
    }
    else
    {
        memmove( tmp_inputs, inputs, sizeof(vsi_nn_tensor_t*) * 2 );
        memmove( tmp_inputs, reshape_tensors, sizeof(vsi_nn_tensor_t*) * 2 );
    }

    image_2d = (outputs[0]->attr.dim_num == 2);
    status = _query_kernel( tmp_inputs, outputs, image_2d, kernel );
    image_2d = (reshape_tensors[2]->attr.dim_num == 2);
    status = _query_kernel( tmp_inputs, &reshape_tensors[2], image_2d, kernel );
    if ( VSI_SUCCESS == status )
    {
        node = vsi_nn_kernel_create_node( graph, kernel );

@ -485,10 +456,16 @@ static vsi_nn_kernel_node_t _setup
        {
            /* Pass parameters to node. */
            vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PARAM_NUM,
                    tmp_inputs, 2, outputs, 1 );
                    tmp_inputs, 2, &reshape_tensors[2], 1 );
            status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PARAM_NUM );
        }
    }

final:
    vsi_safe_release_tensor(reshape_tensors[0]);
    vsi_safe_release_tensor(reshape_tensors[1]);
    vsi_safe_release_tensor(reshape_tensors[2]);

    return node;
} /* _setup() */
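Like the softmax path earlier, the element-wise setups above now normalize shapes before querying a kernel: vsi_nn_kernel_optimize_eltwise_shape merges adjacent dimensions that broadcast identically across both inputs and the output, so most tensors collapse to rank 2 and take the image_2d fast path. A worked example of the kind of folding it is used for (merge rules inferred from the call site, not verified against the implementation):

    /* in0 {2,3,4,1} , in1 {2,3,4,5} , out {2,3,4,5}                   */
    /* dims 0..2 broadcast identically on every operand, so they merge: */
    /* in0 -> {24,1} , in1 -> {24,5} , out -> {24,5} , new_rank = 2     */
    /* reshape_tensors[2]->attr.dim_num == 2  =>  image_2d = TRUE       */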
@ -128,9 +128,6 @@ DEF_KERNEL_INITIALIZER(_mod_initializer)
    vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
    vsi_size_array_t *output_shape = NULL;
    vsi_nn_kernel_dtype_e input0_dtype = F16;
    int32_t input0_fl = 0;
    int32_t input1_fl = 0;
    int32_t output_fl = 0;
    float inScale0 = 1.0f;
    float inScale1 = 1.0f;
    float outScale = 1.0f;
@ -168,59 +165,12 @@ DEF_KERNEL_INITIALIZER(_mod_initializer)
            (output_shape->data[2] + gpu_param.global_scale[2] - 1)
            / gpu_param.global_scale[2] : 1;

    if (input0_attr->quant == VSI_NN_KERNEL_QUANT_DFP)
    {
        input0_fl = input0_attr->dfp.fl;
        if (input0_fl > 0)
        {
            inScale0 = 1.0f / (float) ((int64_t)1 << input0_fl);
        }
        else
        {
            inScale0 = (float)((int64_t)1 << -input0_fl);
        }
    }
    else if (input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM)
    {
        inScale0 = input0_attr->asymm.scale;
        in0Tail = -inScale0 * ((float)input0_attr->asymm.zero_point);
    }

    if (input1_attr->quant == VSI_NN_KERNEL_QUANT_DFP)
    {
        input1_fl = input1_attr->dfp.fl;
        if (input1_fl > 0)
        {
            inScale1 = 1.0f / (float) ((int64_t)1 << input1_fl);
        }
        else
        {
            inScale1 = (float)((int64_t)1 << -input1_fl);
        }
    }
    else if (input1_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM)
    {
        inScale1 = input1_attr->asymm.scale;
        in1Tail = -inScale1 * ((float)input1_attr->asymm.zero_point);
    }

    if (output_attr->quant == VSI_NN_KERNEL_QUANT_DFP)
    {
        output_fl = output_attr->dfp.fl;
        if (output_fl > 0)
        {
            outScale = (float) ((int64_t)1 << output_fl);
        }
        else
        {
            outScale = 1.0f / (float)((int64_t)1 << -output_fl);
        }
    }
    else if (output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM)
    {
        outScale = 1.0f / output_attr->asymm.scale;
        outZp = (float)(output_attr->asymm.zero_point);
    }
    inScale0 = input0_attr->scale;
    in0Tail = 0 - inScale0 * ((float)input0_attr->zero_point);
    inScale1 = input1_attr->scale;
    in1Tail = 0 - inScale1 * ((float)input1_attr->zero_point);
    outScale = 1.0f / output_attr->scale;
    outZp = (float)(output_attr->zero_point);

    if (BF16 == input0_dtype)
    {
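The unified form keeps the shader to one multiply-add per operand by folding the zero point into an additive "tail" constant, exactly as the removed ASYMM branches did:

    /* real = inScale0 * (q - zp)
     *      = inScale0 * q + (0 - inScale0 * zp)
     *      = inScale0 * q + in0Tail              */
    static float dequant_with_tail( int32_t q, float scale, float tail )
    {
        return scale * (float)q + tail;
    }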
@ -239,76 +239,12 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
    CHECK_STATUS_FAIL_GOTO(status, OnError );

    input_shape = attr[0]->shape;

    if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
    {
        input_zp = attr[0]->asymm.zero_point;
        scaleIn = attr[0]->asymm.scale;
    }
    else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        if (attr[0]->dfp.fl > 0)
        {
            scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
        }
        else
        {
            scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
        }

        input_zp = 0;
    }
    else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE)
    {
        input_zp = 0;
        scaleIn = 1;
    }

    if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
    {
        output_ZP0 = (float)attr[1]->asymm.zero_point;
        outputScale0 = 1.0f / attr[1]->asymm.scale;
    }
    else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        if (attr[1]->dfp.fl > 0)
        {
            outputScale0 = (float)((int64_t)1 << attr[1]->dfp.fl);
        }
        else
        {
            outputScale0 = (1.0f / (float)((int64_t)1 << -attr[1]->dfp.fl));
        }
        output_ZP0 = 0.0f;
    }
    else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE )
    {
        outputScale0 = 1.0f;
        output_ZP0 = 0.0f;
    }

    if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
    {
        output_ZP1 = (float)attr[2]->asymm.zero_point;
        outputScale1 = 1.0f / attr[2]->asymm.scale;
    }
    else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        if (attr[2]->dfp.fl > 0)
        {
            outputScale1 = (float)((int64_t)1 << attr[2]->dfp.fl);
        }
        else
        {
            outputScale1 = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl));
        }
        output_ZP1 = 0.0f;
    }
    else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE )
    {
        outputScale1 = 1.0f;
        output_ZP1 = 0.0f;
    }
    input_zp = attr[0]->zero_point;
    scaleIn = attr[0]->scale;
    output_ZP0 = (float)attr[1]->zero_point;
    outputScale0 = 1.0f / attr[1]->scale;
    output_ZP1 = (float)attr[2]->zero_point;
    outputScale1 = 1.0f / attr[2]->scale;

    output_ZP[0] = output_ZP0;
    output_ZP[1] = output_ZP1;
@ -160,16 +160,13 @@ DEF_KERNEL_INITIALIZER(_one_hot_initializer)
    in_shape = attr[0]->shape;
    depth = (int32_t)(attr[1]->shape->data[1]);
    input_dtype = attr[0]->dtype;
    input_zp = attr[0]->zero_point;
    scaleIn = attr[0]->scale;

    if (VSI_NN_KERNEL_QUANT_DFP == attr[0]->quant)
    {
        srcFixPointPos = attr[0]->dfp.fl;
    }
    else if (VSI_NN_KERNEL_QUANT_ASYMM == attr[0]->quant)
    {
        input_zp = attr[0]->asymm.zero_point;
        scaleIn = attr[0]->asymm.scale;
    }

    if (suffix_size == 1)
    {
@ -155,41 +155,19 @@ DEF_KERNEL_INITIALIZER(_poolwithargmax_initializer)
    input_shape = input_attr->shape;
    src_dtype = input_attr->dtype;
    dst_dtype = output_attr->dtype;
    inputScale = input_attr->scale;
    input_ZP = input_attr->zero_point;
    outputScale = output_attr->scale;
    output_ZP = output_attr->zero_point;

    if( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        input_fl = input_attr->dfp.fl;
        if (input_fl > 0)
        {
            inputScale = 1.0f / (float) ((int64_t)1 << input_fl);
        }
        else
        {
            inputScale = (float)((int64_t)1 << -input_fl);
        }
    }
    else if( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
    {
        inputScale = input_attr->asymm.scale;
        input_ZP = input_attr->asymm.zero_point;
    }

    if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        output_fl = output_attr->dfp.fl;
        if (output_fl > 0)
        {
            outputScale = 1.0f / (float) ((int64_t)1 << output_fl);
        }
        else
        {
            outputScale = (float)((int64_t)1 << -output_fl);
        }
    }
    else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
    {
        outputScale = output_attr->asymm.scale;
        output_ZP = output_attr->asymm.zero_point;
    }

    if ( ( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
@ -22,6 +22,7 @@
 *
 *****************************************************************************/

#if !(VX_TENSOR_POW_API_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -158,64 +159,13 @@ DEF_KERNEL_INITIALIZER(_pow_initializer)
    attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
    CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError );

    out_shape = attr[2]->shape;

    if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        int32_t fl = attr[0]->dfp.fl;
        if (fl > 0)
        {
            input0_scale = 1.0f / (float) ((int64_t)1 << fl);
        }
        else
        {
            input0_scale = (float)((int64_t)1 << -fl);
        }
    }
    else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM
           || attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM )
    {
        input0_scale = attr[0]->asymm.scale;
        input0_tail = 0 - (float)attr[0]->asymm.zero_point * input0_scale;
    }

    if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        int32_t fl = attr[1]->dfp.fl;
        if (fl > 0)
        {
            input1_scale = 1.0f / (float) ((int64_t)1 << fl);
        }
        else
        {
            input1_scale = (float)((int64_t)1 << -fl);
        }
    }
    else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM
           || attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM)
    {
        input1_scale = attr[1]->asymm.scale;
        input1_tail = 0 - (float)attr[1]->asymm.zero_point * input1_scale;
    }

    if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        int32_t fl = attr[2]->dfp.fl;
        if (fl > 0)
        {
            output_scale = (float) ((int64_t)1 << fl);
        }
        else
        {
            output_scale = 1.0f / (float)((int64_t)1 << -fl);
        }
    }
    else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM
           || attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM )
    {
        output_zp = (float)attr[2]->asymm.zero_point;
        output_scale = 1.0f / attr[2]->asymm.scale;
    }
    out_shape = attr[2]->shape;
    input0_scale = attr[0]->scale;
    input0_tail = 0 - (float)attr[0]->zero_point * input0_scale;
    input1_scale = attr[1]->scale;
    input1_tail = 0 - (float)attr[1]->zero_point * input1_scale;
    output_zp = (float)attr[2]->zero_point;
    output_scale = 1.0f / attr[2]->scale;

#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \
        (IN0_TYPE | (IN1_TYPE << 8) | ( OUT_TYPE << 16))
@ -454,3 +404,4 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_EVIS( pow, _setup )
#endif
@ -140,28 +140,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer)
    }
    enable_copy = (int32_t)(xRatio == (1 << 15) && yRatio == (1 << 15));

    if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
    {
        if (attr[0]->dfp.fl > 0)
        {
            outputScale = (float)((int64_t)1 << attr[0]->dfp.fl);
        }
        else
        {
            outputScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl));
        }
        dstZP = 0;
    }
    else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
    {
        outputScale = 1.0f / attr[0]->asymm.scale;
        dstZP = attr[0]->asymm.zero_point;
    }
    else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
    {
        outputScale = 1;
        dstZP = 0;
    }
    outputScale = 1.0f / attr[0]->scale;
    dstZP = attr[0]->zero_point;

    shaderParam.global_scale[0] = 4;
    shaderParam.global_scale[1] = 1;
@ -133,28 +133,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_gray_copy_initializer)
    width = (uint32_t)(out_shape->data[0]);
    height = (uint32_t)(out_shape->data[1]);

    if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        if (attr[0]->dfp.fl > 0)
        {
            outputScale = (float)((int64_t)1 << attr[0]->dfp.fl);
        }
        else
        {
            outputScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl));
        }
        dstZP = 0.0f;
    }
    else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
    {
        outputScale = 1.0f / attr[0]->asymm.scale;
        dstZP = (float)attr[0]->asymm.zero_point;
    }
    else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
    {
        outputScale = 1;
        dstZP = 0.0f;
    }
    outputScale = 1.0f / attr[0]->scale;
    dstZP = (float)attr[0]->zero_point;

    shaderParam.global_scale[0] = 16;
    shaderParam.global_scale[1] = 1;
@ -232,33 +212,11 @@ DEF_KERNEL_INITIALIZER(_pre_process_gray_initializer)
    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );

    out_shape = attr[0]->shape;
    dstZP = (float)attr[0]->asymm.zero_point;
    outputScale = attr[0]->asymm.scale;
    dstZP = (float)attr[0]->zero_point;
    outputScale = 1.0f / attr[0]->scale;
    width = (uint32_t)(out_shape->data[0]);
    height = (uint32_t)(out_shape->data[1]);

    if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        if (attr[0]->dfp.fl > 0)
        {
            outputScale = (float)((int64_t)1 << attr[0]->dfp.fl);
        }
        else
        {
            outputScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl));
        }
        dstZP = 0.0f;
    }
    else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
    {
        outputScale = 1.0f/outputScale;
    }
    else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
    {
        outputScale = 1;
        dstZP = 0.0f;
    }

    shaderParam.global_scale[0] = 4;
    shaderParam.global_scale[1] = 1;
    shaderParam.global_scale[2] = 1;
@ -499,8 +457,8 @@ OnError:
    }
    if (attr[1])
    {
        vsi_nn_kernel_tensor_attr_release( &attr[0] );
        attr[0] = NULL;
        vsi_nn_kernel_tensor_attr_release( &attr[1] );
        attr[1] = NULL;
    }

    return status;
@ -0,0 +1,884 @@
/****************************************************************************
*
*    Copyright (c) 2020 Vivante Corporation
*
*    Permission is hereby granted, free of charge, to any person obtaining a
*    copy of this software and associated documentation files (the "Software"),
*    to deal in the Software without restriction, including without limitation
*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
*    and/or sell copies of the Software, and to permit persons to whom the
*    Software is furnished to do so, subject to the following conditions:
*
*    The above copyright notice and this permission notice shall be included in
*    all copies or substantial portions of the Software.
*
*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
*    DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"

__BEGIN_DECLS
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOF16 \
    CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toF16")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOI16 \
    CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toI16")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOU8 \
    CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toU8")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOI8 \
    CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toI8")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_COPY_U8TOU8 \
    CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_copy_U8toU8")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_COPY_U8TOI8 \
    CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_copy_U8toI8")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_COPY_U8TOI16 \
    CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_copy_U8toI16")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_COPY_U8TOF16 \
    CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_copy_U8toF16")

// greater than a quarter
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOU8_GQ \
    CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toU8_gq")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOI8_GQ \
    CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toI8_gq") /* was a copy-paste of the U8toU8_gq entry point */
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOF16_GQ \
    CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toF16_gq")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOI16_GQ \
    CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toI16_gq")

#define KERNEL_SOURCE_1    "pre_process_nv12_rggb_copy",
#define KERNEL_SOURCE_2    "pre_process_nv12_rggb_scale",

typedef enum
{
    COPY = 0,
    SCALE,
    TRANS
} vsi_nn_kernel_convert_type_e;

#define HASH_PRE_PROCESS_NV12_RGGB_KEY(_input0_type, _output_type, _convert_type, _greater_quarter) \
    ((_input0_type << 24) | (_output_type << 16) | (_convert_type << 8) | (_greater_quarter))

#define TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, SOURCE) \
    { HASH_PRE_PROCESS_NV12_RGGB_KEY(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, 0), \
        VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_##CONVERT_TYPE##_##IN0_TYPE##TO##OUT_TYPE, \
        SOURCE },

#define TENSOR_PRE_PROCESS_NV12_RGGB_GQ_KERNELS(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, SOURCE) \
    { HASH_PRE_PROCESS_NV12_RGGB_KEY(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, 1), \
        VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_##CONVERT_TYPE##_##IN0_TYPE##TO##OUT_TYPE##_GQ, \
        SOURCE },

static const struct {
    uint32_t key;
    char* function_name;
    const char* source_name;
} pre_process_nv12_rggb_map[] =
{
    TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, U8,  COPY,  KERNEL_SOURCE_1)
    TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, I8,  COPY,  KERNEL_SOURCE_1)
    TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, I16, COPY,  KERNEL_SOURCE_1)
    TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, F16, COPY,  KERNEL_SOURCE_1)
    TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, U8,  SCALE, KERNEL_SOURCE_2)
    TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, I8,  SCALE, KERNEL_SOURCE_2)
    TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_2)
    TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_2)
    TENSOR_PRE_PROCESS_NV12_RGGB_GQ_KERNELS(U8, U8,  SCALE, KERNEL_SOURCE_2)
    TENSOR_PRE_PROCESS_NV12_RGGB_GQ_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_2)
    TENSOR_PRE_PROCESS_NV12_RGGB_GQ_KERNELS(U8, I8,  SCALE, KERNEL_SOURCE_2)
    TENSOR_PRE_PROCESS_NV12_RGGB_GQ_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_2)
};

static vx_param_description_t vxPreProcessNv12_RGGBKernel_param_def[] =
{
    {VX_INPUT,  VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT,  VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT,  VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT,  VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT,  VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT,  VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT,  VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT,  VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT,  VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT,  VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT,  VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT,  VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT,  VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT,  VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT,  VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _EVIS_PRE_PROCESS_NV12_RGGB_PARAM_NUM _cnt_of_array(vxPreProcessNv12_RGGBKernel_param_def)

static vsi_bool _check_nv12_type_from_env(void)
{
    vsi_bool ret = FALSE;
    char* env_s = vsi_nn_getenv("VSI_NN_ENABLE_OCV_NV12");
    if (env_s)
    {
        ret = TRUE;
    }
    return ret;
}
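The map above is addressed with HASH_PRE_PROCESS_NV12_RGGB_KEY, which packs one byte each of input dtype, output dtype, convert type, and the greater-than-a-quarter flag. A worked example (the numeric dtype enum values are illustrative, not ovxlib's):

    /* Assume U8 = 6 and F16 = 1 for illustration only, with SCALE = 1: */
    /* HASH_PRE_PROCESS_NV12_RGGB_KEY(6, 1, SCALE, 1)                   */
    /*   = (6 << 24) | (1 << 16) | (1 << 8) | 1 = 0x06010101            */
    /* which selects the U8 -> F16 scale "_gq" kernel entry.            */

The OpenCV-compatible NV12 coefficients are opt-in at run time:

    /* any non-empty value enables the OCV NV12 constants */
    /* export VSI_NN_ENABLE_OCV_NV12=1                     */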
DEF_KERNEL_INITIALIZER(_pre_process_nv12_rggb_copy_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VSI_FAILURE;
    gpu_param_t shaderParam = {
        3,          // workdim
        {0, 0, 0},  // globalWorkOffset: control the start location be processed in the image
        {0, 0, 0},  // globalWorkScale: how many pixels could be processed by a single thread
        {0, 0, 0},  // localWorkSize: local group size in thread
        {0, 0, 0}}; // globalWorkSize: image size in thread

    float output_zp = 0;
    float output_scale = 1;
    int32_t reorder = 0;
    int32_t order1 = 3;
    uint32_t width = 0;
    uint32_t height = 0;
    int32_t nv_type = 0;
    float bMean = 0.0f, gMean = 0.0f, rMean = 0.0f;
    float b_scale = 0.0f, g_scale = 0.0f, r_scale = 0.0f;
    float outputScaleVar_b = 0.0f, outputScaleVar_g = 0.0f, outputScaleVar_r = 0.0f;
    float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f;

    vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
    vsi_size_array_t * out_shape = NULL;
    vsi_bool ocv_nv12 = _check_nv12_type_from_env();

    VSI_UNREFERENCED(param_size);

    attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );

    status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[7], &rMean);
    CHECK_STATUS_FAIL_GOTO(status, OnError );
    status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &gMean);
    CHECK_STATUS_FAIL_GOTO(status, OnError );
    status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &bMean);
    CHECK_STATUS_FAIL_GOTO(status, OnError );
    status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[10], &r_scale);
    CHECK_STATUS_FAIL_GOTO(status, OnError );
    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &reorder);
    CHECK_STATUS_FAIL_GOTO(status, OnError );
    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &nv_type);
    CHECK_STATUS_FAIL_GOTO(status, OnError );
    status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[14], &g_scale);
    CHECK_STATUS_FAIL_GOTO(status, OnError );
    status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[15], &b_scale);
    CHECK_STATUS_FAIL_GOTO(status, OnError );

    out_shape = attr[0]->shape;
    output_scale = 1.0f / attr[0]->scale;
    output_zp = (float)attr[0]->zero_point;
    width = (uint32_t)(out_shape->data[0]);
    height = (uint32_t)(out_shape->data[1]);

    if (reorder != 0)
    {
        reorder = 3;
        order1 = 0;
    }

    if (nv_type == VSI_NN_YUV_TYPE_NV21_BGGR)
    {
        int32_t tmporder = reorder;
        reorder = order1;
        order1 = tmporder;
    }

    outputScaleVar_b = output_scale * b_scale;
    outputScaleVar_g = output_scale * g_scale;
    outputScaleVar_r = output_scale * r_scale;
    bMeanScaleVarZp = output_zp - bMean * outputScaleVar_b;
    gMeanScaleVarZp = output_zp - gMean * outputScaleVar_g;
    rMeanScaleVarZp = output_zp - rMean * outputScaleVar_r;

    shaderParam.global_scale[0] = 4;
    shaderParam.global_scale[1] = 1;
    shaderParam.global_scale[2] = 1;
    shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1)
            / shaderParam.global_scale[0], 4);
    shaderParam.global_size[1] = gpu_align_p2((height + shaderParam.global_scale[1] - 1)
            / shaderParam.global_scale[1], 2);
    shaderParam.global_size[2] = 1;

    status = vsi_nn_kernel_gpu_config( node, &shaderParam );
    CHECK_STATUS_FAIL_GOTO(status, OnError);
    {
        gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
            0x33333333, // TCfg
            0x11110000, // ASelt
            0x03020100, 0x03020100, // ABin
            0x00000000, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00002400, // AccumType, ConstantType, and PostShift
            0x00000000, 0x00000000, 0x00000000, 0x00000000,
            0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
        }, GPU_DP_TYPE_16 };
        gpu_dp_inst_t uniConvertNV12toB_4x4 = {{
            0x05050505, // TCfg
            0x04040404, // ASelt
            0x00210000, 0x00630042, // ABin
            0x0a0a0a0a, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000100, // AccumType, ConstantType, and PostShift
            0x3f1d3c00, 0x00000000, 0x3f1d3c00, 0x00000000,
            0x3f1d3c00, 0x00000000, 0x3f1d3c00, 0x00000000 // Constant
        }, GPU_DP_TYPE_16 };
        gpu_dp_inst_t uniConvertNV12toG_4x4 = {{
            0x29292929, // TCfg
            0x14141414, // ASelt
            0x03210100, 0x07630542, // ABin
            0x2a2a2a2a, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000100, // AccumType, ConstantType, and PostShift
            0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc,
            0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc // Constant
        }, GPU_DP_TYPE_16 };
        gpu_dp_inst_t uniConvertNV12toR_4x4 = {{
            0x05050505, // TCfg
            0x04040404, // ASelt
            0x00310010, 0x00730052, // ABin
            0x0a0a0a0a, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000100, // AccumType, ConstantType, and PostShift
            0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000,
            0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000 // Constant
        }, GPU_DP_TYPE_16 };
        gpu_dp_inst_t uniExtractYtoShortSub16_2x8 = {{
            0x11111111, // TCfg
            0x00000000, // ASelt
            0x03020100, 0x07060504, // ABin
            0x22222222, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000400, // AccumType, ConstantType, and PostShift
            0x00000001, 0x00000001, 0x00000001, 0x00000001,
            0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
        }, GPU_DP_TYPE_16 };

        gpu_dp_inst_t uniExtractUVtoCharSub128_2x8 = {{
            0x99999999, // TCfg
            0x44444444, // ASelt
            0x01000100, 0x03020302, // ABin
            0xaaaaaaaa, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000600, // AccumType, ConstantType, and PostShift
            0x00010001, 0x00010001, 0x00010001, 0x00010001,
            0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
        }, GPU_DP_TYPE_16 };
        gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{
            0x11111111, // TCfg
            0x11110000, // ASelt
            0x06040200, 0x06040200, // ABin
            0x22222222, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000600, // AccumType, ConstantType, and PostShift
            0x00000001, 0x00000001, 0x00000001, 0x00000001,
            0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
        }, GPU_DP_TYPE_16 };
        gpu_dp_inst_t uniConvertUchartoFp32_4x4 = {{
            0x01010101, // TCfg
            0x00000000, // ASelt
            0x00010000, 0x00030002, // ABin
            0x02020202, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000400, // AccumType, ConstantType, and PostShift
            0x00000001, 0x00000000, 0x00000001, 0x00000000,
            0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
        }, GPU_DP_TYPE_16 };

        if (ocv_nv12)
        {
            uniConvertNV12toB_4x4.data[2] = 0x00010000;
            uniConvertNV12toB_4x4.data[3] = 0x00230022;
            uniConvertNV12toB_4x4.data[8] = 0x40093ca7;
            uniConvertNV12toB_4x4.data[10] = 0x40093ca7;
            uniConvertNV12toB_4x4.data[12] = 0x40093ca7;
            uniConvertNV12toB_4x4.data[14] = 0x40093ca7;

            uniConvertNV12toG_4x4.data[2] = 0x01010100;
            uniConvertNV12toG_4x4.data[3] = 0x03230322;
            uniConvertNV12toG_4x4.data[8] = 0x36413ca7;
            uniConvertNV12toG_4x4.data[9] = 0x00003a81;
            uniConvertNV12toG_4x4.data[10] = 0x36413ca7;
            uniConvertNV12toG_4x4.data[11] = 0x00003a81;
            uniConvertNV12toG_4x4.data[12] = 0x36413ca7;
            uniConvertNV12toG_4x4.data[13] = 0x00003a81;
            uniConvertNV12toG_4x4.data[14] = 0x36413ca7;
            uniConvertNV12toG_4x4.data[15] = 0x00003a81;

            uniConvertNV12toR_4x4.data[2] = 0x00110010;
            uniConvertNV12toR_4x4.data[3] = 0x00330032;
            uniConvertNV12toR_4x4.data[8] = 0x3e623ca7;
            uniConvertNV12toR_4x4.data[10] = 0x3e623ca7;
            uniConvertNV12toR_4x4.data[12] = 0x3e623ca7;
            uniConvertNV12toR_4x4.data[14] = 0x3e623ca7;

            uniExtractUVtoCharSub128_2x8.data[2] = 0x03020100;
            uniExtractUVtoCharSub128_2x8.data[3] = 0x07060504;

            uniExtractYtoShortSub16_2x8.data[0] = 0x99999999;
            uniExtractYtoShortSub16_2x8.data[1] = 0x44444444;
            uniExtractYtoShortSub16_2x8.data[4] = 0xaaaaaaaa;
            uniExtractYtoShortSub16_2x8.data[8] = 0x00010001;
            uniExtractYtoShortSub16_2x8.data[9] = 0x00010001;
            uniExtractYtoShortSub16_2x8.data[10] = 0x00010001;
            uniExtractYtoShortSub16_2x8.data[11] = 0x00010001;
            uniExtractYtoShortSub16_2x8.data[12] = 0x00010001;
            uniExtractYtoShortSub16_2x8.data[13] = 0x00010001;
            uniExtractYtoShortSub16_2x8.data[14] = 0x00010001;
            uniExtractYtoShortSub16_2x8.data[15] = 0x00010001;
        }

        status  = vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toB_4x4", &uniConvertNV12toB_4x4);
        status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toG_4x4", &uniConvertNV12toG_4x4);
        status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toR_4x4", &uniConvertNV12toR_4x4);
        status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder);
        status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1);
        status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractUVtoCharSub128_2x8", &uniExtractUVtoCharSub128_2x8);
        status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractYtoShortSub16_2x8", &uniExtractYtoShortSub16_2x8);
        status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_b", &outputScaleVar_b);
        status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_g", &outputScaleVar_g);
        status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_r", &outputScaleVar_r);
        status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp);
        status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp);
        status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp);
        status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUchartoFp32_4x4", &uniConvertUchartoFp32_4x4);
        CHECK_STATUS_FAIL_GOTO(status, OnError);
        switch( attr[0]->dtype )
        {
        case U8:
        case I8:
        case I16:
            {
                status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertInt32toUint8_2x8);
                CHECK_STATUS_FAIL_GOTO(status, OnError );
            }
            break;
        case F16:
            {
                status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertHalftoFp16_2x8);
                CHECK_STATUS_FAIL_GOTO(status, OnError );
            }
            break;
        default:
            break;
        }
        CHECK_STATUS_FAIL_GOTO(status, OnError );
    }

OnError:
    if (attr[0])
    {
        vsi_nn_kernel_tensor_attr_release( &attr[0] );
        attr[0] = NULL;
    }
    return status;
} /* _pre_process_nv12_rggb_copy_initializer() */
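Both initializers fold per-channel normalization and output quantization into one constant pair per channel, so the shader evaluates a single multiply-add per pixel:

    /* out_q = ((c - mean) * ch_scale) * output_scale + output_zp
     *       = c * outputScaleVar + MeanScaleVarZp                  */
    static float fold_channel( float c, float mean, float ch_scale,
                               float output_scale, float output_zp )
    {
        float var = output_scale * ch_scale;  /* outputScaleVar_{b,g,r} */
        float zp  = output_zp - mean * var;   /* {b,g,r}MeanScaleVarZp  */
        return c * var + zp;
    }

The scale variant below additionally works in fixed point: xRatio/yRatio are Q15 crop ratios, so `(xRatio * width) >> 15` recovers the ROI extent, and xrIntFloat_16/yrIntFloat_16 are 16.16 per-pixel steps across that ROI.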
DEF_KERNEL_INITIALIZER(_pre_process_nv12_rggb_initializer)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
gpu_param_t shaderParam = {
|
||||
3, // workdim
|
||||
{0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
|
||||
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
|
||||
{0, 0, 0}, // localWorkSize: local group size in thread
|
||||
{0, 0, 0}}; // globalWorkSize: image size in thread
|
||||
|
||||
float output_zp = 0;
|
||||
float output_scale = 1;
|
||||
int32_t reorder = 0;
|
||||
int32_t order1 = 3;
|
||||
uint32_t width = 0;
|
||||
uint32_t height = 0;
|
||||
uint32_t roi_width = 0;
|
||||
uint32_t roi_height = 0;
|
||||
uint32_t xrIntFloat_16 = 0;
|
||||
uint32_t yrIntFloat_16 = 0;
|
||||
int32_t xRatio = 0;
|
||||
int32_t yRatio = 0;
|
||||
int32_t nv_type = 0;
|
||||
float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f;
|
||||
float b_scale = 0.0f, g_scale = 0.0f, r_scale = 0.0f;
|
||||
float outputScaleVar_b = 0.0f, outputScaleVar_g = 0.0f, outputScaleVar_r = 0.0f;
|
||||
float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f;
|
||||
float resize = 0.0f;
|
||||
vsi_bool ocv_nv12 = _check_nv12_type_from_env();
|
||||
|
||||
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
|
||||
vsi_size_array_t * out_shape = NULL;
|
||||
|
||||
VSI_UNREFERENCED(param_size);
|
||||
|
||||
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
|
||||
|
||||
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
|
||||
|
||||
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &xRatio);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &yRatio);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[7], &rMean);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &gMean);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &bMean);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[10], &r_scale);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &reorder);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &nv_type);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[14], &g_scale);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[15], &b_scale);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
|
||||
out_shape = attr[1]->shape;
|
||||
output_scale = 1.0f / attr[1]->scale;
|
||||
output_zp = (float)attr[1]->zero_point;
|
||||
width = (uint32_t)(out_shape->data[0]);
|
||||
height = (uint32_t)(out_shape->data[1]);
|
||||
|
||||
if (reorder != 0)
|
||||
{
|
||||
reorder = 3;
|
||||
order1 = 0;
|
||||
}
|
||||
|
||||
if (nv_type == VSI_NN_YUV_TYPE_NV21_BGGR)
|
||||
{
|
||||
int32_t tmporder = reorder;
|
||||
reorder = order1;
|
||||
order1 = tmporder;
|
||||
}
|
||||

roi_width = (xRatio * width) >> 15;
roi_height = (yRatio * height) >> 15;
resize = (float)width / roi_width;
xrIntFloat_16 = (uint32_t)((roi_width << 16) / width + 1);
yrIntFloat_16 = (uint32_t)((roi_height << 16) / height + 1);
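/* Editorial note: xRatio/yRatio are Q15 fixed-point ratios (src/dst scaled by
 * 1 << 15), so (xRatio * width) >> 15 recovers the ROI size in source pixels,
 * and xrIntFloat_16/yrIntFloat_16 re-encode the same ratio in Q16 (+1 rounds
 * up) for per-pixel stepping in the shader. For example, width = 640 with
 * xRatio = 2 << 15 (a 2x downscale) gives roi_width = 1280 and
 * xrIntFloat_16 = (1280 << 16) / 640 + 1 = 0x20001, i.e. ~2.0 in Q16. */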

outputScaleVar_b = output_scale * b_scale;
outputScaleVar_g = output_scale * g_scale;
outputScaleVar_r = output_scale * r_scale;
bMeanScaleVarZp = output_zp - bMean * outputScaleVar_b;
gMeanScaleVarZp = output_zp - gMean * outputScaleVar_g;
rMeanScaleVarZp = output_zp - rMean * outputScaleVar_r;
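/* Editorial note: the six values above fold quantization and per-channel
 * normalization into one multiply-add per pixel. For blue, for example,
 *   q_out = in * outputScaleVar_b + bMeanScaleVarZp
 *         = in * (output_scale * b_scale) + (output_zp - bMean * output_scale * b_scale)
 * which equals quantize((in - bMean) * b_scale) with no per-pixel subtraction. */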

shaderParam.global_scale[0] = 4;
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1)
/ shaderParam.global_scale[0], 4);
shaderParam.global_size[1] = gpu_align_p2((height + shaderParam.global_scale[1] - 1)
/ shaderParam.global_scale[1], 2);
shaderParam.global_size[2] = 1;
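/* Editorial note: each thread produces 4 output pixels along x and 1 row along
 * y, so the global work size is the output size divided by the per-thread
 * scale, rounded up, then padded by gpu_align_p2 to a multiple of the given
 * power of two (4 in x, 2 in y). E.g. width = 638 -> (638 + 3) / 4 = 160,
 * already a multiple of 4, so global_size[0] = 160. */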

status = vsi_nn_kernel_gpu_config( node, &shaderParam );
CHECK_STATUS_FAIL_GOTO(status, OnError);

{
gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };

gpu_dp_inst_t uniConvertNV12toB_4x4 = {{
0x05050505, // TCfg
0x04040404, // ASelt
0x00210000, 0x00630042, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x3f1d3c00, 0x00000000, 0x3f1d3c00, 0x00000000,
0x3f1d3c00, 0x00000000, 0x3f1d3c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertNV12toG_4x4 = {{
0x29292929, // TCfg
0x14141414, // ASelt
0x03210100, 0x07630542, // ABin
0x2a2a2a2a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc,
0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertNV12toR_4x4 = {{
0x05050505, // TCfg
0x04040404, // ASelt
0x00310010, 0x00730052, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000,
0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertYtoShortSub16_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };

gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertUVtoCharSub128_2x8 = {{
0x99999999, // TCfg
0x44444444, // ASelt
0x03020100, 0x07060504, // ABin
0xaaaaaaaa, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00010001, 0x00010001, 0x00010001, 0x00010001,
0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
}, GPU_DP_TYPE_16 };

//trans
gpu_dp_inst_t uniCalculateYShift_2x8 = {{
0x00009999, // TCfg
0x00000000, // ASelt
0x06040200, 0x00000000, // ABin
0x00005555, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniCalculateUVShift_2x8 = {{
0x51515151, // TCfg
0x40404040, // ASelt
0x02020000, 0x06060404, // ABin
0x91919191, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00010000, 0x00000000, 0x00010000,
0x00000000, 0x00010000, 0x00000000, 0x00010000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertUchartoFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };

if (ocv_nv12)
{
uniConvertNV12toB_4x4.data[2] = 0x00010000;
uniConvertNV12toB_4x4.data[3] = 0x00230022;
uniConvertNV12toB_4x4.data[8] = 0x40093ca7;
uniConvertNV12toB_4x4.data[10] = 0x40093ca7;
uniConvertNV12toB_4x4.data[12] = 0x40093ca7;
uniConvertNV12toB_4x4.data[14] = 0x40093ca7;

uniConvertNV12toG_4x4.data[2] = 0x01010100;
uniConvertNV12toG_4x4.data[3] = 0x03230322;
uniConvertNV12toG_4x4.data[8] = 0x36413ca7;
uniConvertNV12toG_4x4.data[9] = 0x00003a81;
uniConvertNV12toG_4x4.data[10] = 0x36413ca7;
uniConvertNV12toG_4x4.data[11] = 0x00003a81;
uniConvertNV12toG_4x4.data[12] = 0x36413ca7;
uniConvertNV12toG_4x4.data[13] = 0x00003a81;
uniConvertNV12toG_4x4.data[14] = 0x36413ca7;
uniConvertNV12toG_4x4.data[15] = 0x00003a81;

uniConvertNV12toR_4x4.data[2] = 0x00110010;
uniConvertNV12toR_4x4.data[3] = 0x00330032;
uniConvertNV12toR_4x4.data[8] = 0x3e623ca7;
uniConvertNV12toR_4x4.data[10] = 0x3e623ca7;
uniConvertNV12toR_4x4.data[12] = 0x3e623ca7;
uniConvertNV12toR_4x4.data[14] = 0x3e623ca7;

uniConvertYtoShortSub16_2x8.data[0] = 0x99999999;
uniConvertYtoShortSub16_2x8.data[1] = 0x44444444;
uniConvertYtoShortSub16_2x8.data[4] = 0xaaaaaaaa;
uniConvertYtoShortSub16_2x8.data[8] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[9] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[10] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[11] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[12] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[13] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[14] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[15] = 0x00010001;
}

status = vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toB_4x4", &uniConvertNV12toB_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toG_4x4", &uniConvertNV12toG_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toR_4x4", &uniConvertNV12toR_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUVtoCharSub128_2x8", &uniConvertUVtoCharSub128_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertYtoShortSub16_2x8", &uniConvertYtoShortSub16_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "xrIntFloat_16", &xrIntFloat_16);
status |= vsi_nn_kernel_gpu_add_param(node, "yrIntFloat_16", &yrIntFloat_16);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_b", &outputScaleVar_b);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_g", &outputScaleVar_g);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_r", &outputScaleVar_r);
status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp);

if (resize >= 0.25)
{
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateYShift_2x8", &uniCalculateYShift_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateUVShift_2x8", &uniCalculateUVShift_2x8);
}
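/* Editorial note: the two shift LUTs are only uploaded when resize >= 0.25,
 * i.e. the crop is downscaled by at most 4x; presumably the shift-based
 * sampling path in the shader only covers that range, and stronger downscales
 * fall back to the plain Q16-stepping path. */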
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUchartoFp32_4x4", &uniConvertUchartoFp32_4x4);
CHECK_STATUS_FAIL_GOTO(status, OnError );

status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder);
status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1);
CHECK_STATUS_FAIL_GOTO(status, OnError );

switch( attr[1]->dtype )
{
case U8:
case I8:
case I16:
{
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertInt32toUint8_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case F16:
{
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertHalftoFp16_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
default:
break;
}
}

OnError:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;
} /* _pre_process_nv12_rggb_initializer() */

static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel,
const vsi_nn_kernel_param_t * params,
int32_t scale_x
)
{
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8;
vsi_nn_kernel_convert_type_e convert_type = SCALE;
vsi_status status = VSI_FAILURE;
uint32_t key = 0;
size_t i = 0;
vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" );
vsi_size_t dstWidth = outputs[0]->attr.size[0];
float scaleVal = (float)dstWidth / ((scale_x * dstWidth) >> 15);
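/* Editorial note: scaleVal above inverts the Q15 ratio to recover the resize
 * factor chosen by the caller: (scale_x * dstWidth) >> 15 is the source ROI
 * width, so scaleVal = dstWidth / srcWidth. E.g. scale_x = 2 << 15 gives
 * scaleVal = 0.5, and the optFlg test below mirrors the resize >= 0.25 check
 * in the initializer. */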
uint32_t optFlg = 0;

input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

if (enable_copy)
{
convert_type = COPY;
}
else
{
convert_type = SCALE;
}

if (scaleVal >= 0.25 && convert_type == SCALE)
{
optFlg = 1;
}

key = HASH_PRE_PROCESS_NV12_RGGB_KEY( input0_dtype, output_dtype, convert_type, optFlg );
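/* Editorial note: the key above packs input dtype, output dtype, the
 * COPY/SCALE convert type and the optimization flag into one integer, so the
 * linear scan below can match a prebuilt kernel with a single compare. */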

for ( i = 0; i < _cnt_of_array(pre_process_nv12_rggb_map); i++ )
{
if ( pre_process_nv12_rggb_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(pre_process_nv12_rggb_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_nv12_rggb_map[i].function_name );
kernel->info.parameters = vxPreProcessNv12_RGGBKernel_param_def;
kernel->info.numParams = _cnt_of_array( vxPreProcessNv12_RGGBKernel_param_def );

if (convert_type == COPY)
{
kernel->info.initialize = _pre_process_nv12_rggb_copy_initializer;
}
else
{
kernel->info.initialize = _pre_process_nv12_rggb_initializer;
}
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
pre_process_nv12_rggb_map[i].source_name );
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
pre_process_nv12_rggb_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */

static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_NV12_RGGB_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
int32_t trans = 0;
int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" );

VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);

if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}

status = _query_kernel( inputs, outputs, kernel, params, scale_x );
if ( VSI_SUCCESS == status )
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
uint32_t index = 3;
int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" );
int32_t left = vsi_nn_kernel_param_get_int32( params, "left" );
int32_t top = vsi_nn_kernel_param_get_int32( params, "top" );
float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" );
float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" );
float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" );
float r_scale = vsi_nn_kernel_param_get_float32( params, "r_scale" );
float g_scale = vsi_nn_kernel_param_get_float32( params, "g_scale" );
float b_scale = vsi_nn_kernel_param_get_float32( params, "b_scale" );
int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" );
int32_t nv_type = vsi_nn_kernel_param_get_int32( params, "nv_type" );

/* Pass parameters to node. */
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_NV12_RGGB_PARAM_NUM,
inputs, 2, outputs, 1 );
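/* Editorial note: pack_io fills slots 0..2 with the two input tensors and the
 * output tensor; the scalars appended from index 3 onward line up with the
 * param[3]..param[15] reads in _pre_process_nv12_rggb_initializer above. */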

tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_scale );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &nv_type );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_scale );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_scale );
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PRE_PROCESS_NV12_RGGB_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &tmp_params[3] );
vsi_nn_kernel_scalar_release( &tmp_params[4] );
vsi_nn_kernel_scalar_release( &tmp_params[5] );
vsi_nn_kernel_scalar_release( &tmp_params[6] );
vsi_nn_kernel_scalar_release( &tmp_params[7] );
vsi_nn_kernel_scalar_release( &tmp_params[8] );
vsi_nn_kernel_scalar_release( &tmp_params[9] );
vsi_nn_kernel_scalar_release( &tmp_params[10] );
vsi_nn_kernel_scalar_release( &tmp_params[11] );
vsi_nn_kernel_scalar_release( &tmp_params[12] );
vsi_nn_kernel_scalar_release( &tmp_params[13] );
vsi_nn_kernel_scalar_release( &tmp_params[14] );
vsi_nn_kernel_scalar_release( &tmp_params[15] );
}
}

return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_EVIS( pre_process_nv12_rggb, _setup )

@ -403,23 +403,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_copy_initializer)
out_shape = attr[0]->shape;
width = (uint32_t)(out_shape->data[0]);

if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if ( attr[0]->dfp.fl > 0 )
{
output_scale *= (float)((int64_t)1 << attr[0]->dfp.fl);
}
else
{
output_scale *= (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl));
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
output_zp = (float)attr[0]->asymm.zero_point;
output_scale /= attr[0]->asymm.scale;
}
output_zp = (float)attr[0]->zero_point;
output_scale = 1.0f / attr[0]->scale;

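/* Editorial note on this and the following hunks: the per-scheme branches
 * (DFP / ASYMM / NONE) are collapsed into the unified attr->scale and
 * attr->zero_point fields. A minimal sketch of the mapping this assumes
 * (hypothetical helper, not part of ovxlib):
 *
 *   float unified_scale(const vsi_nn_kernel_tensor_attr_t * a)
 *   {
 *       if (a->quant == VSI_NN_KERNEL_QUANT_DFP)
 *           return a->dfp.fl >= 0 ? 1.0f / (float)((int64_t)1 << a->dfp.fl)
 *                                 : (float)((int64_t)1 << -a->dfp.fl);
 *       if (a->quant == VSI_NN_KERNEL_QUANT_ASYMM)
 *           return a->asymm.scale;
 *       return 1.0f; // VSI_NN_KERNEL_QUANT_NONE
 *   }
 *
 * With scale defined this way (and zero_point = 0 for DFP and NONE),
 * output_scale = 1.0f / attr[0]->scale reproduces every removed branch. */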
shaderParam.global_scale[0] = 16;
shaderParam.global_scale[1] = 1;

@ -620,8 +605,8 @@ OnError:
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}

return status;

@ -463,22 +463,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_copy_initializer)
width = (uint32_t)(out_shape->data[0] / 3);
height = (uint32_t)(out_shape->data[1]);

if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if ( attr[0]->dfp.fl > 0 )
{
output_scale *= (float)((int64_t)1 << attr[0]->dfp.fl);
}
else
{
output_scale *= (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl));
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
output_zp = (float)attr[0]->asymm.zero_point;
output_scale /= attr[0]->asymm.scale;
}
output_zp = (float)attr[0]->zero_point;
output_scale = 1.0f / attr[0]->scale;

if (attr[0]->dtype == F16 || attr[0]->dtype == I16)
{

@ -787,8 +773,8 @@ OnError:
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}

return status;
@ -179,28 +179,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
}
enable_copy = (int32_t)(xRatio == (1 << 15) && yRatio == (1 << 15));

if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[0]->dfp.fl > 0)
{
outputScale = (float)((int64_t)1 << attr[0]->dfp.fl);
}
else
{
outputScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl));
}
outputZP = 0;
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
outputScale = 1.0f / attr[0]->asymm.scale;
outputZP = (float)attr[0]->asymm.zero_point;
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
outputScale = 1;
outputZP = 0;
}
outputScale = 1.0f / attr[0]->scale;
outputZP = (float)attr[0]->zero_point;

#define _PACK_SELECT_KEY( COPY_FLAG, REVERSE_FLAG, TRANS_FLAG) \
(COPY_FLAG | (REVERSE_FLAG << 24) | (TRANS_FLAG << 16) )

@ -143,23 +143,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer)
order1 = 0;
}

if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
dstScale = 1.0f / attr[0]->asymm.scale;
dstZP = attr[0]->asymm.zero_point;
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[0]->dfp.fl > 0)
{
dstScale = (float)((int64_t)1 << attr[0]->dfp.fl);
}
else
{
dstScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl));
}
dstZP = 0;
}
dstScale = 1.0f / attr[0]->scale;
dstZP = attr[0]->zero_point;

shaderParam.global_scale[0] = 16;
shaderParam.global_scale[1] = 1;

@ -501,8 +486,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );

out_shape = attr[0]->shape;
dstZP = attr[0]->asymm.zero_point;
dstScale = attr[0]->asymm.scale;
dstZP = attr[0]->zero_point;
dstScale = 1.0f / attr[0]->scale;
width = (uint32_t)(out_shape->data[0]);
height = (uint32_t)(out_shape->data[1]);

@ -512,28 +497,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer)
order1 = 0;
}

if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[0]->dfp.fl > 0)
{
dstScale = (vx_float32)((int64_t)1 << attr[0]->dfp.fl);
}
else
{
dstScale = (1.0f / (vx_float32)((int64_t)1 << -attr[0]->dfp.fl));
}
dstZP = 0;
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
dstScale = 1.0f / dstScale;
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
dstScale = 1;
dstZP = 0;
}

shaderParam.global_scale[0] = 4;
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;

@ -164,46 +164,24 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );

out_shape = attr[2]->shape;
out_shape = attr[2]->shape;
inputZP0 = attr[0]->zero_point;
input_scale0 = attr[0]->scale;
inputZP1 = attr[1]->zero_point;
input_scale1 = attr[1]->scale;
outputZP = (float)attr[2]->zero_point;
input_scale0 = input_scale0 / attr[2]->scale;

if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
in0_fl = (int8_t)attr[0]->dfp.fl;
if (in0_fl >= 0)
{
input_scale0 = 1.0f / (vx_float32) ((int64_t)1 << in0_fl);
}
else if (in0_fl < 0)
{
input_scale0 = (vx_float32) ((int64_t)1 << -in0_fl);
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
inputZP0 = attr[0]->asymm.zero_point;
input_scale0 = attr[0]->asymm.scale;
}

if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
inputZP1 = attr[1]->asymm.zero_point;
input_scale1 = attr[1]->asymm.scale;
}

if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
out_fl = (int8_t)attr[2]->dfp.fl;
}

if (out_fl >= 0)
input_scale0 *= (vx_float32)((int64_t)1 << out_fl);
else if (out_fl < 0)
input_scale0 *= 1.0f / (vx_float32) ((int64_t)1 << -out_fl);
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
out_fl = 1;
outputZP = (float)attr[2]->asymm.zero_point;
input_scale0 = input_scale0 / attr[2]->asymm.scale;
}
shift0 = in0_fl - out_fl;

is_2d_img = (out_shape->size < 3) || (out_shape->data[2] == 1);
@ -152,7 +152,6 @@ DEF_KERNEL_INITIALIZER(_reducemax_internal_initializer)
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t * input_shape = NULL;
vsi_size_array_t * output_shape = NULL;
int32_t input_fl = 0, output_fl = 0;
int32_t axisSize = 0;
float inputScale = 1.0f;
float input_offset_asymmetric = 0.0f;

@ -257,68 +256,19 @@ DEF_KERNEL_INITIALIZER(_reducemax_internal_initializer)
}
}

if( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
input_fl = input_attr->dfp.fl;
if (input_fl > 0)
{
inputScale = 1.0f / (float) ((int64_t)1 << input_fl);
}
else
{
inputScale = (float)((int64_t)1 << -input_fl);
}
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
inputScale = input_attr->asymm.scale;
input_offset_asymmetric = (float)(input_attr->asymm.zero_point);
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
{
inputScale = 1.0f;
input_offset_asymmetric = 0;
inputScale = input_attr->scale;
input_offset_asymmetric = (float)(input_attr->zero_point);
outputScale = 1.0f / output_attr->scale;
output_offset_asymmetric = (float)(output_attr->zero_point);

status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );

status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );

if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
output_fl = output_attr->dfp.fl;
if (output_fl > 0)
{
outputScale = (float) ((int64_t)1 << output_fl);
}
else
{
outputScale = 1.0f / (float)((int64_t)1 << -output_fl);
}
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
outputScale = 1.0f / output_attr->asymm.scale;
output_offset_asymmetric = (float)(output_attr->asymm.zero_point);
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
{
outputScale = 1.0f;
output_offset_asymmetric = 0;
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
status = vsi_nn_kernel_gpu_add_param( node, "axisSize", &axisSize );
CHECK_STATUS_FAIL_GOTO(status, final );
status = vsi_nn_kernel_gpu_config( node, &gpu_param );

@ -154,7 +154,6 @@ DEF_KERNEL_INITIALIZER(_reducemin_internal_initializer)
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t * input_shape = NULL;
vsi_size_array_t * output_shape = NULL;
int32_t input_fl = 0, output_fl = 0;
int32_t axisSize = 0;
float inputScale = 1.0f;
float input_offset_asymmetric = 0.0f;

@ -259,68 +258,18 @@ DEF_KERNEL_INITIALIZER(_reducemin_internal_initializer)
}
}

if( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
input_fl = input_attr->dfp.fl;
if (input_fl > 0)
{
inputScale = 1.0f / (float) ((int64_t)1 << input_fl);
}
else
{
inputScale = (float)((int64_t)1 << -input_fl);
}
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
inputScale = input_attr->asymm.scale;
input_offset_asymmetric = (float)(input_attr->asymm.zero_point);
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
{
inputScale = 1.0f;
input_offset_asymmetric = 0;
inputScale = input_attr->scale;
input_offset_asymmetric = (float)(input_attr->zero_point);
outputScale = 1.0f / output_attr->scale;
output_offset_asymmetric = (float)(output_attr->zero_point);

status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );

if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
output_fl = output_attr->dfp.fl;
if (output_fl > 0)
{
outputScale = (float) ((int64_t)1 << output_fl);
}
else
{
outputScale = 1.0f / (float)((int64_t)1 << -output_fl);
}
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
outputScale = 1.0f / output_attr->asymm.scale;
output_offset_asymmetric = (float)(output_attr->asymm.zero_point);
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
{
outputScale = 1.0f;
output_offset_asymmetric = 0;
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );

status = vsi_nn_kernel_gpu_add_param( node, "axisSize", &axisSize );
CHECK_STATUS_FAIL_GOTO(status, final );
@ -160,7 +160,6 @@ DEF_KERNEL_INITIALIZER(_reduceprod_internal_initializer)
vsi_size_array_t * output_shape = NULL;
vsi_nn_kernel_dtype_e src_dtype = F16;
vsi_nn_kernel_dtype_e dst_dtype = F16;
int32_t input_fl = 0, output_fl = 0;
int32_t axisSize = 0;
float inputScale = 1.0f;
float input_offset_asymmetric = 0.0f;

@ -348,68 +347,17 @@ DEF_KERNEL_INITIALIZER(_reduceprod_internal_initializer)
CHECK_STATUS_FAIL_GOTO(status, final );
}

if( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
input_fl = input_attr->dfp.fl;
if (input_fl > 0)
{
inputScale = 1.0f / (float) ((int64_t)1 << input_fl);
}
else
{
inputScale = (float)((int64_t)1 << -input_fl);
}
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
inputScale = input_attr->asymm.scale;
input_offset_asymmetric = (float)(input_attr->asymm.zero_point);
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
{
inputScale = 1.0f;
input_offset_asymmetric = 0;
inputScale = input_attr->scale;
input_offset_asymmetric = (float)(input_attr->zero_point);
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );

status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}

if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
output_fl = output_attr->dfp.fl;
if (output_fl > 0)
{
outputScale = (float) ((int64_t)1 << output_fl);
}
else
{
outputScale = 1.0f / (float)((int64_t)1 << -output_fl);
}
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
outputScale = 1.0f / output_attr->asymm.scale;
output_offset_asymmetric = (float)(output_attr->asymm.zero_point);
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
{
outputScale = 1.0f;
output_offset_asymmetric = 0;
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
outputScale = 1.0f / output_attr->scale;
output_offset_asymmetric = (float)(output_attr->zero_point);
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );

status = vsi_nn_kernel_gpu_config( node, &gpu_param );

@ -138,8 +138,6 @@ DEF_KERNEL_INITIALIZER(_relu_keras_initializer)
float inputTail = 0.0f;
float output_ZP = 0;
float input_ZP = 0;
int32_t srcFixPointPos = 0;
int32_t dstFixPointPos = 0;

VSI_UNREFERENCED(param_size);

@ -154,25 +152,10 @@ DEF_KERNEL_INITIALIZER(_relu_keras_initializer)
output_dtype = output_attr->dtype;
offset = alpha * threshold;

if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
srcFixPointPos = input_attr->dfp.fl;
}
else if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant)
{
input_ZP = (float)(input_attr->asymm.zero_point);
scaleIn = input_attr->asymm.scale;
}

if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant)
{
dstFixPointPos = output_attr->dfp.fl;
}
else if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant)
{
output_ZP = (float)(output_attr->asymm.zero_point);
scaleOut = 1.0f / output_attr->asymm.scale;
}
input_ZP = (float)(input_attr->zero_point);
scaleIn = input_attr->scale;
output_ZP = (float)(output_attr->zero_point);
scaleOut = 1.0f / output_attr->scale;

gpu_param.global_scale[0] = 8;
gpu_param.global_scale[1] = 1;

@ -195,11 +178,6 @@ DEF_KERNEL_INITIALIZER(_relu_keras_initializer)
}
else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
if (srcFixPointPos >= 0)
scaleIn = 1.0f / (float) ((int64_t)1 << srcFixPointPos);
else
scaleIn = (float) ((int64_t)1 << -srcFixPointPos);

status = vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn);
CHECK_STATUS_FAIL_GOTO(status, final );
}

@ -212,11 +190,6 @@ DEF_KERNEL_INITIALIZER(_relu_keras_initializer)
}
else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant)
{
if (dstFixPointPos >= 0)
scaleOut = (float) ((int64_t)1 << dstFixPointPos);
else
scaleOut = 1.0f / (float) ((int64_t)1 << -dstFixPointPos);

status = vsi_nn_kernel_gpu_add_param(node, "output_scale", &scaleOut);
CHECK_STATUS_FAIL_GOTO(status, final );
}
@ -197,8 +197,6 @@ DEF_KERNEL_INITIALIZER(_resize_1d_bilinear_initializer)
int32_t half_pixel_centers = 0;

uint32_t depth = 0;
int32_t srcFixPointPos = 0;
int32_t dstFixPointPos = 0;
float input_scale = 1.0;
int32_t inputZP = 0;
float output_scale = 1.0;

@ -259,53 +257,10 @@ DEF_KERNEL_INITIALIZER(_resize_1d_bilinear_initializer)
half_pixel_value = 0.0f;
}

if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant )
{
input_scale = input_attr->asymm.scale;
inputZP = input_attr->asymm.zero_point;
}
else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
srcFixPointPos = input_attr->dfp.fl;
if (srcFixPointPos >= 0)
{
input_scale = 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos);
}
else if (srcFixPointPos < 0)
{
input_scale = (vx_float32)((int64_t)1 << -srcFixPointPos);
}
inputZP = 0;
}
else
{
input_scale = 1.0f;
inputZP = 0;
}

if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant )
{
output_scale = output_attr->asymm.scale;
outputZP = output_attr->asymm.zero_point;
}
else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant)
{
dstFixPointPos = output_attr->dfp.fl;
if (dstFixPointPos >= 0)
{
output_scale = (vx_float32) ((int64_t)1 << dstFixPointPos);
}
else if (dstFixPointPos < 0)
{
output_scale = 1.0f / (vx_float32) ((int64_t)1 << -dstFixPointPos);
}
outputZP = 0;
}
else
{
output_scale = 1.0;
outputZP = 0;
}
input_scale = input_attr->scale;
inputZP = input_attr->zero_point;
output_scale = output_attr->scale;
outputZP = output_attr->zero_point;

if (is_run_nx_kernel)
{

@ -473,7 +428,7 @@ DEF_KERNEL_INITIALIZER(_resize_1d_bilinear_initializer)
}
else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
float dfpScale = input_scale * output_scale;
float dfpScale = input_scale / output_scale;
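/* Editorial note: with the unified fields, output_scale now holds the raw
 * attr->scale (the old DFP code stored its reciprocal, 1 << fl), so the
 * requantization factor becomes input_scale / output_scale instead of the
 * former product -- the same value, expressed against the new convention. */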
gpu_dp_inst_t uniConvertDFP2FP32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt

@ -198,52 +198,19 @@ DEF_KERNEL_INITIALIZER(_resize_1d_nearest_initializer)
half_pixel_value = 0.0f;
}

if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant )
{
input_scale = input_attr->asymm.scale;
inputZP = input_attr->asymm.zero_point;
}
else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
input_scale = input_attr->scale;
inputZP = input_attr->zero_point;
output_scale = 1.0f / output_attr->scale;
outputZP = output_attr->zero_point;

if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
srcFixPointPos = input_attr->dfp.fl;
if (srcFixPointPos >= 0)
{
input_scale = 1.0f / (float) ((int64_t)1 << srcFixPointPos);
}
else if (srcFixPointPos < 0)
{
input_scale = (float)((int64_t)1 << -srcFixPointPos);
}
inputZP = 0;
}
else
{
input_scale = 1.0f;
inputZP = 0;
}

if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant )
{
output_scale = 1.0f / output_attr->asymm.scale;
outputZP = output_attr->asymm.zero_point;
}
else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant)
if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant)
{
dstFixPointPos = output_attr->dfp.fl;
if (dstFixPointPos >= 0)
{
output_scale = (float) ((int64_t)1 << dstFixPointPos);
}
else if (dstFixPointPos < 0)
{
output_scale = 1.0f / (float) ((int64_t)1 << -dstFixPointPos);
}
outputZP = 0;
}
else
{
output_scale = 1.0;
outputZP = 0;
}

if (F16 == input_dtype && F16 == output_dtype)
@ -122,12 +122,16 @@ static const _kernel_map_type _resize_bilinear_kernel_map[] =
PACK_KERNEL_MAP_DOWN(I16, I16),
PACK_KERNEL_MAP_DOWN(U8, F16),
PACK_KERNEL_MAP_DOWN(U8, U8),
PACK_KERNEL_MAP_DOWN(U16, F16),
PACK_KERNEL_MAP_DOWN(U16, U16),
PACK_KERNEL_MAP_DOWN(F16, F16),
PACK_KERNEL_MAP_DOWN(F16, U8),
PACK_KERNEL_MAP_DOWN(F16, U16),
PACK_KERNEL_MAP_DOWN(BF16, BF16),
PACK_KERNEL_MAP_UP(I8, I8),
PACK_KERNEL_MAP_UP(I16, I16),
PACK_KERNEL_MAP_UP(U8, U8),
PACK_KERNEL_MAP_UP(U16, U16),
PACK_KERNEL_MAP_UP(F16, F16),
PACK_KERNEL_MAP_UP(BF16, BF16),
PACK_KERNEL_MAP_UP_OPT(U8, U8),

@ -223,8 +227,6 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
int32_t half_pixel_centers;

uint32_t depth = 0;
int32_t srcFixPointPos = 0;
int32_t dstFixPointPos = 0;
float input_scale = 1.0;
int32_t inputZP = 0;
float output_scale = 1.0;

@ -285,201 +287,16 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
half_pixel_value = 0.0f;
}

if (U8 == input_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant )
{
input_scale = input_attr->asymm.scale;
inputZP = input_attr->asymm.zero_point;
}
else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
srcFixPointPos = input_attr->dfp.fl;
if (srcFixPointPos >= 0)
{
input_scale = 1.0f / (float) ((int64_t)1 << srcFixPointPos);
}
else if (srcFixPointPos < 0)
{
input_scale = (float)((int64_t)1 << -srcFixPointPos);
}
inputZP = 0;
}
else
{
input_scale = 1.0f;
inputZP = 0;
}

if (U8 == output_dtype && VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant )
{
output_scale = output_attr->asymm.scale;
outputZP = output_attr->asymm.zero_point;
}
else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant)
{
dstFixPointPos = output_attr->dfp.fl;
if (dstFixPointPos >= 0)
{
output_scale = (float) ((int64_t)1 << dstFixPointPos);
}
else if (dstFixPointPos < 0)
{
output_scale = 1.0f / (float) ((int64_t)1 << -dstFixPointPos);
}
outputZP = 0;
}
else
{
output_scale = 1.0;
outputZP = 0;
}
input_scale = input_attr->scale;
inputZP = input_attr->zero_point;
output_scale = output_attr->scale;
outputZP = output_attr->zero_point;

gpu_param.global_scale[0] = 4;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;

if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
float dfpScale = input_scale * output_scale;
gpu_dp_inst_t uniConvertDFP2FP32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000300, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtact8Bit_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniRightSubLeft_4x4 = {{
0x09090909, // TCfg
0x00000000, // ASelt
0x00230001, 0x00670045, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniDFPtoFp32_left_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00020000, 0x00060004, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};

if (I8 == input_dtype && I8 == output_dtype && out_width > in_width)
{
gpu_dp_inst_t uniConvertI32toI16_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniGetMaskShift_2x8 = {{
0x99999999, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x55555555, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniConvertDFP2FP32_part1_4x4 = {{
0x09090909, // TCfg
0x00000000, // ASelt
0x00150004, 0x00370026, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000300, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};

status = vsi_nn_kernel_gpu_add_param( node, "uniConvertI32toI16_2x8", &uniConvertI32toI16_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniGetMaskShift_2x8", &uniGetMaskShift_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniDFPtoFp32_left_4x4", &uniConvertDFP2FP32_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4",
&uniConvertDFP2FP32_part1_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth);
CHECK_STATUS_FAIL_GOTO(status, final );

gpu_param.global_scale[2] = depth;
}
else if (I16 == input_dtype && I16 == output_dtype && out_width > in_width)
{
gpu_dp_inst_t uniConvertI32toI16_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniGetMaskShift_2x8 = {{
0x99999999, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x55555555, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniConvertDFP2FP32_part1_4x4 = {{
0x09090909, // TCfg
0x00000000, // ASelt
0x00150004, 0x00370026, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000300, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};

status = vsi_nn_kernel_gpu_add_param( node, "uniConvertI32toI16_2x8", &uniConvertI32toI16_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniGetMaskShift_2x8", &uniGetMaskShift_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniDFPtoFp32_left_4x4", &uniConvertDFP2FP32_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4",
&uniConvertDFP2FP32_part1_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth);
CHECK_STATUS_FAIL_GOTO(status, final );

gpu_param.global_scale[2] = depth;
}
else
{
status = vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniRightSubLeft_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniDFPtoFp32_left_4x4", &uniDFPtoFp32_left_4x4);
CHECK_STATUS_FAIL_GOTO(status, final );
}

status = vsi_nn_kernel_gpu_add_param( node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "scale_xy", scale_factor);
status |= vsi_nn_kernel_gpu_add_param( node, "dfpScale", &dfpScale);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (U8 == input_dtype && (U8 == output_dtype || F16 == output_dtype))
if ((U8 == input_dtype || U16 == input_dtype || I8 == input_dtype || I16 == input_dtype))
{
float uint8Scale = input_scale / output_scale;
float uint8ZP_out = (float)outputZP;

@ -615,7 +432,7 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
}
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (F16 == input_dtype && (U8 == output_dtype || F16 == output_dtype))
else if (F16 == input_dtype && (U8 == output_dtype || F16 == output_dtype || U16 == output_dtype))
{
float uint8Scale = 1.0f / output_scale;
float uint8ZP_out = (float)outputZP;
@ -0,0 +1,453 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"

__BEGIN_DECLS

#define _RESIZE_CUBIC_KERNEL_SOURCE() "resize_cubic"

#define STR(a) #a
// Add kernel hashtable here
#define RESIZE_CUBIC_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
(( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) )

#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_CUBIC_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \
CVIVANTE_NAMESPACE("evis.resize_cubic_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
_RESIZE_CUBIC_KERNEL_SOURCE() }

typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;

static const _kernel_map_type _resize_cubic_kernel_map[] =
{
PACK_KERNEL_MAP(F16, F16),
PACK_KERNEL_MAP(I16, I16),
PACK_KERNEL_MAP(F16, I16),
PACK_KERNEL_MAP(I16, F16),
PACK_KERNEL_MAP(I8, I8),
PACK_KERNEL_MAP(F16, I8),
PACK_KERNEL_MAP(I8, F16),
PACK_KERNEL_MAP(U8, U8),
PACK_KERNEL_MAP(F16, U8),
PACK_KERNEL_MAP(U8, F16),
};

/*
* Kernel params
*/
static vx_param_description_t _resize_cubic_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
};

#define RESIZE_CUBIC_NUM _cnt_of_array( _resize_cubic_kernel_param_def )

/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_resize_cubic_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t *input_attr = NULL;
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t * out_shape = NULL;

float input_scale = 1.0;
float input_tail = 0;
float output_scale = 1.0;
float output_tail = 0;

VSI_UNREFERENCED(param_size);

input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0]);
CHECK_PTR_FAIL_GOTO( input_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1]);
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );

out_shape = output_attr->shape;

if ( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = input_attr->dfp.fl;
if (fl > 0)
{
input_scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
input_scale = (float)((int64_t)1 << -fl);
}
}
else if ( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
input_scale = input_attr->asymm.scale;
input_tail = 0 - input_scale * (float)input_attr->asymm.zero_point;
}

if ( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = output_attr->dfp.fl;
if (fl > 0)
{
output_scale = (float) ((int64_t)1 << fl);
}
else
{
output_scale = 1.0f / (float)((int64_t)1 << -fl);
}
}
else if ( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
output_scale = 1.0f / output_attr->asymm.scale;
output_tail = (float)output_attr->asymm.zero_point;
}
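/* Editorial note: the four values above linearize dequantize/requantize for
 * the shader: real = q_in * input_scale + input_tail (since input_tail is
 * -input_scale * zero_point) turns any supported input into float, and
 * q_out = real * output_scale + output_tail converts the cubic-filtered
 * result back (for ASYMM output, output_scale is already the reciprocal of
 * the tensor scale). */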
|
||||
gpu_param.global_scale[0] = 4;
|
||||
gpu_param.global_scale[1] = 1;
|
||||
gpu_param.global_scale[2] = 1;
|
||||
|
||||
{
        gpu_dp_inst_t uniFp16ToFp32_4x4 = {{
            0x01010101, // TCfg
            0x00000000, // ASelt
            0x00010000, 0x00030002, // ABin
            0x02020202, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000400, // AccumType, ConstantType, and PostShift
            0x00000001, 0x00000000, 0x00000001, 0x00000000,
            0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
        }, GPU_DP_TYPE_16};

        gpu_dp_inst_t uniExtract8Bit_2x8 = {{
            0x33333333, // TCfg
            0x11110000, // ASelt
            0x03020100, 0x03020100, // ABin
            0x00000000, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00002400, // AccumType, ConstantType, and PostShift
            0x00000000, 0x00000000, 0x00000000, 0x00000000,
            0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
        }, GPU_DP_TYPE_16};

        gpu_dp_inst_t uniExtractHalf8_2x8 = {{
            0x11111111, // TCfg
            0x11110000, // ASelt
            0x06040200, 0x06040200, // ABin
            0x22222222, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000100, // AccumType, ConstantType, and PostShift
            0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
            0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
        }, GPU_DP_TYPE_16};
        status = vsi_nn_kernel_gpu_add_param( node, "uniFp16ToFp32_4x4", &uniFp16ToFp32_4x4);
        status |= vsi_nn_kernel_gpu_add_param( node, "uniExtractHalf8_2x8", &uniExtractHalf8_2x8);
        status |= vsi_nn_kernel_gpu_add_param( node, "uniExtract8Bit_2x8", &uniExtract8Bit_2x8);
    }
    status |= vsi_nn_kernel_gpu_add_param( node, "input_scale", &input_scale);
    status |= vsi_nn_kernel_gpu_add_param( node, "input_tail", &input_tail);
    status |= vsi_nn_kernel_gpu_add_param( node, "output_scale", &output_scale);
    status |= vsi_nn_kernel_gpu_add_param( node, "output_tail", &output_tail);


    gpu_param.global_size[0] = gpu_align_p2(
        (out_shape->data[0] + gpu_param.global_scale[0] - 1)
        / gpu_param.global_scale[0], 4);
    gpu_param.global_size[1] = (
        (out_shape->data[1] + gpu_param.global_scale[1] - 1)
        / gpu_param.global_scale[1]);
    gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
    status = vsi_nn_kernel_gpu_config( node, &gpu_param );

final:
    if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr );
    if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr );
    return status;
} /* _resize_cubic_initializer() */



/*
 * Query kernel
 */
static vsi_status _query_kernel
    (
    vsi_nn_kernel_t * kernel,
    vsi_nn_tensor_t * const * const inputs,
    vsi_nn_tensor_t * const * const outputs
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_dtype_e in_dtype;
    vsi_nn_kernel_dtype_e out_dtype;
    const _kernel_map_type * kernel_map = _resize_cubic_kernel_map;
    size_t kernel_map_size = _cnt_of_array( _resize_cubic_kernel_map );
    vx_param_description_t * param_def = _resize_cubic_kernel_param_def;
    size_t param_def_size = RESIZE_CUBIC_NUM;
    vx_kernel_initialize_f initializer = _resize_cubic_initializer;

    uint32_t key = 0;
    uint32_t i = 0;

    in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    key = RESIZE_CUBIC_HASH_KEY( in_dtype, out_dtype );

    for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
    {
        if ( kernel_map[i].key == key )
        {
            break;
        }
    }
    if ( i < kernel_map_size )
    {
        snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
        kernel->info.parameters = param_def;
        kernel->info.numParams = (uint32_t)param_def_size;
        kernel->info.initialize = initializer;
        // Register code source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
            "vsi_nn_kernel_header",
            kernel_map[i].source_name );
        // Register binary source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
            kernel_map[i].source_name );
        status = VSI_SUCCESS;
    }

    return status;
} /* _query_kernel() */

static vsi_nn_tensor_t* _create_scale_tensor
    (
    vsi_nn_graph_t *graph,
    vsi_size_t output_size,
    float scale_factor,
    float half_pixel_value,
    vsi_nn_tensor_t** index
    )
{
    vsi_nn_tensor_attr_t attr;
    vsi_nn_tensor_t* scale = NULL;
    vsi_size_t i = 0;
    float *scale_data_ptr = NULL;
    int *index_data_ptr = NULL;
    float scale_value = 0;
    vsi_ssize_t data = 0;
    int idx = 0;
    float delta_v = 0;
    float cubic_coeff_a = -0.5f;
    vsi_size_t item_count = 4 * output_size;
    scale_data_ptr = (float *)malloc(item_count * sizeof(float));
    if (scale_data_ptr == NULL)
    {
        VSILOGE("allocate memory fail at function %s line %d", __FUNCTION__, __LINE__);
        goto OnError;
    }

    index_data_ptr = (int *)malloc(output_size * sizeof(int));
    if (index_data_ptr == NULL)
    {
        VSILOGE("allocate memory fail at function %s line %d", __FUNCTION__, __LINE__);
        goto OnError;
    }

    for (i = 0; i < output_size; i ++)
    {
        scale_value = ((float)i + half_pixel_value) * scale_factor - half_pixel_value;
        data = (vsi_ssize_t)scale_value;
        delta_v = scale_value - (float)data;
        idx = (int)data - 1;

        index_data_ptr[i] = idx;
        scale_data_ptr[i * 4 + 0] = cubic_coeff_a * (((delta_v - 4) * (delta_v + 1) + 8) * (delta_v + 1) - 4);
        scale_data_ptr[i * 4 + 1] = ((cubic_coeff_a + 2) * delta_v - (cubic_coeff_a + 3)) * delta_v * delta_v + 1;
        scale_data_ptr[i * 4 + 2] = ((cubic_coeff_a + 2) * (1 - delta_v) - (cubic_coeff_a + 3))
            * (1 - delta_v) * (1 - delta_v) + 1;
        scale_data_ptr[i * 4 + 3] = cubic_coeff_a * (((2 - delta_v - 5) * (2 - delta_v) + 8) * (2 - delta_v) - 4);
    }
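
    /* Editorial note: with cubic_coeff_a = -0.5 the four weights computed
     * above are the Keys cubic-convolution kernel evaluated at distances
     * 1 + d, d, 1 - d and 2 - d from the sample position, i.e. the classic
     * bicubic interpolation taps; idx marks the leftmost of the four source
     * taps for each output coordinate. */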
    attr.size[0] = item_count;
    attr.dim_num = 1;
    attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
    attr.vtl = FALSE;

    scale = vsi_nn_CreateTensorFromData(graph, (uint8_t *)scale_data_ptr, &attr);
    if (scale_data_ptr)
    {
        free(scale_data_ptr);
        scale_data_ptr = NULL;
    }

    attr.size[0] = output_size;
    attr.dim_num = 1;
    attr.dtype.vx_type = VSI_NN_TYPE_INT32;
    attr.vtl = FALSE;

    *index = vsi_nn_CreateTensorFromData(graph, (uint8_t *)index_data_ptr, &attr);
    if (index_data_ptr)
    {
        free(index_data_ptr);
        index_data_ptr = NULL;
    }

OnError:
    return scale;
}

static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t ** outputs,
    size_t output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t * kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[RESIZE_CUBIC_NUM] = {NULL};
    vsi_nn_kernel_node_t node = NULL;
    int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" );
    int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" );
    vsi_size_t in_width = inputs[0]->attr.size[0];
    vsi_size_t in_height = inputs[0]->attr.size[1];
    vsi_size_t out_width = outputs[0]->attr.size[0];
    vsi_size_t out_height = outputs[0]->attr.size[1];
    float half_pixel_value = 0.0f;
    float width_scale = 0.0f;
    float height_scale = 0.0f;
    vsi_nn_tensor_t* scale_w = NULL;
    vsi_nn_tensor_t* scale_h = NULL;
    vsi_nn_tensor_t* index_w = NULL;
    vsi_nn_tensor_t* index_h = NULL;

    if (align_corners && out_width > 1)
    {
        width_scale = ((vx_float32)(in_width - 1) * 1.0f) / (vx_float32)(out_width - 1);
    }
    else
    {
        width_scale = ((vx_float32)in_width * 1.0f) / (vx_float32)out_width;
    }

    if (align_corners && out_height > 1)
    {
        height_scale = ((vx_float32)(in_height - 1) * 1.0f) / (vx_float32)(out_height - 1);
    }
    else
    {
        height_scale = ((vx_float32)in_height * 1.0f) / (vx_float32)out_height;
    }
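
    /* Editorial note: with align_corners the mapping uses (in - 1) / (out - 1)
     * so corner samples coincide exactly; otherwise it uses in / out, and
     * half_pixel_centers (below) shifts sampling by half a pixel. */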

    if (half_pixel_centers)
    {
        half_pixel_value = 0.5f;
    }
    else
    {
        half_pixel_value = 0.0f;
    }

    status = _query_kernel( kernel, inputs, outputs );
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if ( node )
        {
            size_t node_params_num = RESIZE_CUBIC_NUM;
            /* Set inputs and outputs */
            vsi_nn_kernel_node_pack_io( node_params, RESIZE_CUBIC_NUM,
                inputs, input_num, outputs, output_num );
            scale_w = _create_scale_tensor(graph, out_width,
                width_scale, half_pixel_value, &index_w);
            CHECK_PTR_FAIL_GOTO( scale_w, "Create buffer fail.", final );
            CHECK_PTR_FAIL_GOTO( index_w, "Create buffer fail.", final );
            scale_h = _create_scale_tensor(graph, out_height,
                height_scale, half_pixel_value, &index_h);
            CHECK_PTR_FAIL_GOTO( scale_h, "Create buffer fail.", final );
            CHECK_PTR_FAIL_GOTO( index_h, "Create buffer fail.", final );
            node_params[2] = (vsi_nn_kernel_node_param_t)(scale_w->t);
            node_params[3] = (vsi_nn_kernel_node_param_t)(scale_h->t);
            node_params[4] = (vsi_nn_kernel_node_param_t)(index_w->t);
            node_params[5] = (vsi_nn_kernel_node_param_t)(index_h->t);
            /* Pass parameters to node. */
            status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
            VSI_ASSERT( status == VSI_SUCCESS );
            vsi_nn_kernel_scalar_release( &node_params[2] );
            vsi_nn_kernel_scalar_release( &node_params[3] );
            vsi_nn_kernel_scalar_release( &node_params[4] );
            vsi_nn_kernel_scalar_release( &node_params[5] );
        }
    }

final:
    vsi_safe_release_tensor(scale_w);
    vsi_safe_release_tensor(scale_h);
    vsi_safe_release_tensor(index_w);
    vsi_safe_release_tensor(index_h);
    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_EVIS( resize_cubic, _setup )
@@ -208,52 +208,19 @@ DEF_KERNEL_INITIALIZER(_resize_nearest_initializer)
        half_pixel_value = 0.0f;
    }

    if (U8 == input_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant )
    {
        input_scale = input_attr->asymm.scale;
        inputZP = input_attr->asymm.zero_point;
    }
    else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
    input_scale = input_attr->scale;
    inputZP = input_attr->zero_point;
    output_scale = 1.0f / output_attr->scale;
    outputZP = output_attr->zero_point;

    if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
    {
        srcFixPointPos = input_attr->dfp.fl;
        if (srcFixPointPos >= 0)
        {
            input_scale = 1.0f / (float) ((int64_t)1 << srcFixPointPos);
        }
        else if (srcFixPointPos < 0)
        {
            input_scale = (float)((int64_t)1 << -srcFixPointPos);
        }
        inputZP = 0;
    }
    else
    {
        input_scale = 1.0f;
        inputZP = 0;
    }

    if (U8 == output_dtype && VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant )
    {
        output_scale = 1.0f / output_attr->asymm.scale;
        outputZP = output_attr->asymm.zero_point;
    }
    else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant)
    if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant)
    {
        dstFixPointPos = output_attr->dfp.fl;
        if (dstFixPointPos >= 0)
        {
            output_scale = (float) ((int64_t)1 << dstFixPointPos);
        }
        else if (dstFixPointPos < 0)
        {
            output_scale = 1.0f / (float) ((int64_t)1 << -dstFixPointPos);
        }
        outputZP = 0;
    }
    else
    {
        output_scale = 1.0;
        outputZP = 0;
    }

    if (F16 == input_dtype && F16 == output_dtype)
@@ -208,10 +208,7 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_initializer)
    height = (int32_t)(attr[2]->shape->data[1]);
    index_num = (int32_t)(attr[0]->shape->data[1]);

    if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
    {
        output_zp = attr[2]->asymm.zero_point;
    }
    output_zp = attr[2]->zero_point;

    if (coord_dim == 3)
    {
@@ -367,10 +364,7 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_big_initializer)
    height = (int32_t)(attr[2]->shape->data[1]);
    index_num = (int32_t)(attr[0]->shape->data[1]);

    if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
    {
        output_zp = attr[2]->asymm.zero_point;
    }
    output_zp = attr[2]->zero_point;

    if (coord_dim == 3)
    {
@@ -382,6 +382,12 @@ static vsi_status check_scatter_nd_update_index_repeat
    int32_t* mask_buffer = NULL;
    int32_t mask_len = 0;

    if (indices_num == 1)
    {
        isRepeat[0] = 0;
        return VSI_SUCCESS;
    }

    if (inputs[1]->attr.is_const == FALSE)
    {
        isRepeat[0] = 1;
@@ -451,7 +457,7 @@ static vsi_status check_scatter_nd_update_index_repeat
            else if (mask_buffer[mask_idx] > 0)
            {
                isRepeat[0] = 1;
                status = VSI_FAILURE;
                status = VSI_SUCCESS;
                CHECK_STATUS_FAIL_GOTO( status, final );
            }
        }
@@ -0,0 +1,861 @@
/****************************************************************************
 *
 * Copyright (c) 2019 Vivante Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 *****************************************************************************/


#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"

__BEGIN_DECLS

typedef enum
{
    NONE = 0,
    Add,
    Mul,
    Max,
    Min
} vsi_scatter_nd_update_type_e;
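
/* Editorial note: this backend lowers scatter_nd_update-with-reduction into
 * three shader passes, wired together in _setup() below: a preprocess pass
 * that converts the reference tensor to float, a process pass that applies
 * the selected reduction at the scattered coordinates, and a conversion pass
 * that quantizes the float result back to the output dtype. */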

/*
 * Define kernel meta.
 */
#define KERNEL_SOURCE_1 "scatter_nd_update_reduction"
#define KERNEL_SOURCE_2 "scatter_nd_update_reduction_conv"

#define HASH_SCATTER_ND_UPDATE_KEY(_in0_type, _in2_type, _out_type, _stage, _op_type) \
    ((_in0_type << 24) | (_in2_type << 16) | (_out_type << 8) | (_stage << 4) | (_op_type))

#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_PREPROCESS_NAME(SRC0_TYPE) \
    CVIVANTE_NAMESPACE("evis.scatter_nd_update_reduction_preprocess_"#SRC0_TYPE)

#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_PROCESS_NAME(REDUCTION_TYPE, SRC2_TYPE) \
    CVIVANTE_NAMESPACE("evis.scatter_nd_update_reduction_"#REDUCTION_TYPE"_"#SRC2_TYPE)

#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_CONV_NAME(DST_TYPE) \
    CVIVANTE_NAMESPACE("evis.scatter_nd_update_reduction_conv_"#DST_TYPE)

#define TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(IN0_TYPE, SOURCE) \
    { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, 0, 0, 0, 0), \
      HASH_SCATTER_ND_UPDATE_SH_KERNEL_PREPROCESS_NAME(IN0_TYPE), \
      SOURCE },

#define TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(REDUCTION_TYPE, IN2_TYPE, SOURCE) \
    { HASH_SCATTER_ND_UPDATE_KEY(0, IN2_TYPE, 0, 1, REDUCTION_TYPE), \
      HASH_SCATTER_ND_UPDATE_SH_KERNEL_PROCESS_NAME(REDUCTION_TYPE, IN2_TYPE), \
      SOURCE },

#define TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(OUT_TYPE, SOURCE) \
    { HASH_SCATTER_ND_UPDATE_KEY(0, 0, OUT_TYPE, 2, 0), \
      HASH_SCATTER_ND_UPDATE_SH_KERNEL_CONV_NAME(OUT_TYPE), \
      SOURCE },

typedef struct
{
    uint32_t key;
    char * function_name;
    const char * source_name;
} _kernel_map_type;

static const _kernel_map_type scatter_nd_update_reduction_preprocess_map[] =
{
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(U8, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(I8, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(I16, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(F16, KERNEL_SOURCE_1)
};

static const _kernel_map_type scatter_nd_update_reduction_process_map[] =
{
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, U8, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, U8, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, U8, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, U8, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, I8, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, I8, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, I8, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, I8, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, I16, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, I16, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, I16, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, I16, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, F16, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, F16, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, F16, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, F16, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, BF16, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, BF16, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, BF16, KERNEL_SOURCE_1)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, BF16, KERNEL_SOURCE_1)
};

static const _kernel_map_type scatter_nd_update_reduction_conv_map[] =
{
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(U8, KERNEL_SOURCE_2)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(I8, KERNEL_SOURCE_2)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(I16, KERNEL_SOURCE_2)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(F16, KERNEL_SOURCE_2)
    TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(BF16, KERNEL_SOURCE_2)
};

/*
 * Kernel params
 */
static vx_param_description_t _scatter_nd_update_preprocess_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};

static vx_param_description_t _scatter_nd_update_process_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};

static vx_param_description_t _scatter_nd_update_conv_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};

#define _SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM _cnt_of_array(_scatter_nd_update_preprocess_kernel_param_def)
#define _SCATTER_ND_UPDATE_PROCESS_PARAM_NUM _cnt_of_array(_scatter_nd_update_process_kernel_param_def)
#define _SCATTER_ND_UPDATE_CONV_PARAM_NUM _cnt_of_array(_scatter_nd_update_conv_kernel_param_def)

static vsi_status get_scatter_nd_update_tensor_reshape_size
    (
    vsi_nn_tensor_t ** inputs,
    vsi_size_t sizes[VSI_NN_MAX_DIM_NUM],
    uint32_t block_size,
    uint32_t coordDim,
    vsi_size_t strides[VSI_NN_MAX_DIM_NUM],
    int32_t* newDim
    )
{
    vsi_status status = VSI_SUCCESS;
    uint32_t dims_num = inputs[0]->attr.dim_num;
    vsi_size_t *input_size = inputs[0]->attr.size;
    uint32_t i = 0;
    vsi_size_t elementCnt = 1;

#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH

    newDim[0] = 0;
    for (i = 0; i < dims_num; ++i)
    {
        elementCnt *= input_size[i];
    }

    for (i = 0; i < VSI_NN_MAX_DIM_NUM; ++i)
    {
        sizes[i] = 1;
    }

    sizes[0] = block_size;
    sizes[1] = elementCnt / block_size;
    newDim[0] = 2;

    if (coordDim == 1 && strides) // index shape
    {
        for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++)
        {
            strides[i] = 0;
        }
    }
    else if (coordDim >= 2 && coordDim <= VSI_NN_MAX_DIM_NUM && strides)
    {
        for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++)
        {
            strides[i] = 0;
        }

        strides[0] = input_size[dims_num - coordDim];
        for (i = 1; i < coordDim - 1; i++)
        {
            strides[i] = strides[i - 1] * input_size[dims_num - coordDim + i];
        }
    }
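
    /* Editorial note: strides[] now holds the flattened row-major offsets of
     * the innermost coordDim dimensions, letting the shader collapse an index
     * tuple into a single linear offset. */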

#undef VSI_NN_MAX_IMAGE_WIDTH

    return status;
} /* get_scatter_nd_update_tensor_reshape_size() */

/*
 * Kernel initializer
 */
DEF_KERNEL_INITIALIZER(_scatter_nd_update_preprocess_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VSI_FAILURE;
    gpu_param_t gpu_param = {
        1,
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0}
        };

    vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
    int32_t width = 0;
    int32_t element_size = 1;
    int32_t input_zp0 = 0;
    float input_scale0 = 1;
    int32_t i = 0;

    VSI_UNREFERENCED(param_size);

    attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );

    for (i = 0; i < (int32_t)attr[0]->shape->size; i++)
    {
        element_size *= (int32_t)attr[0]->shape->data[i];
    }
    width = element_size / 8;

    input_zp0 = attr[0]->zero_point;
    input_scale0 = attr[0]->scale;

    if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE)
    {
        input_scale0 = 1.0f;
    }

    gpu_param.global_scale[0] = 1;
    gpu_param.global_scale[1] = 1;
    gpu_param.global_scale[2] = 1;
    if (element_size < 8)
    {
        gpu_param.global_size[0] = element_size;
    }
    else
    {
        gpu_param.global_size[0] = width;
    }
    gpu_param.global_size[1] = 1;
    gpu_param.global_size[2] = 1;

    status = vsi_nn_kernel_gpu_config( node, &gpu_param );
    CHECK_STATUS_FAIL_GOTO(status, OnError);

    {
        gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{
            0x05050505, // TCfg
            0x04040404, // ASelt
            0x00010000, 0x00030002, // ABin
            0x0a0a0a0a, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000400, // AccumType, ConstantType, and PostShift
            0xffff0001, 0x00000000, 0xffff0001, 0x00000000,
            0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant
        }, GPU_DP_TYPE_16 };
        gpu_dp_inst_t uniConvert2ndU8SubZpToFp32_4x4 = {{
            0x09090909, // TCfg
            0x04040404, // ASelt
            0x00050004, 0x00070006, // ABin
            0x0a0a0a0a, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000300, // AccumType, ConstantType, and PostShift
            0x00010001, 0x00000000, 0x00010001, 0x00000000,
            0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
        }, GPU_DP_TYPE_16 };

        status = vsi_nn_kernel_gpu_add_param( node,
            "uniConvert1stUint8SubZpToFp32_4x4", &uniConvert1stUint8SubZpToFp32_4x4 );
        status |= vsi_nn_kernel_gpu_add_param( node,
            "uniConvert2ndU8SubZpToFp32_4x4", &uniConvert2ndU8SubZpToFp32_4x4 );
        status |= vsi_nn_kernel_gpu_add_param( node, "input_scale", &input_scale0 );
        status |= vsi_nn_kernel_gpu_add_param( node, "input_zp", &input_zp0 );
        CHECK_STATUS_FAIL_GOTO(status, OnError);
    }

OnError:
    if (attr[0])
    {
        vsi_nn_kernel_tensor_attr_release( &attr[0] );
        attr[0] = NULL;
    }
    return status;
} /* _scatter_nd_update_preprocess_initializer() */

DEF_KERNEL_INITIALIZER(_scatter_nd_update_process_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VSI_FAILURE;
    gpu_param_t gpu_param = {
        2,
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0}
        };

    vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL };
    int32_t block_size = 1;
    int32_t update_width = 1;
    int32_t index_num = 1;
    int32_t width = 0;
    int32_t coord_dim = 0;
    int32_t strides[VSI_NN_MAX_DIM_NUM] = {0};
    int32_t coord_strides[8] = {0};
    int32_t coord_strides1[4] = {0};
    int32_t input_zp2 = 0;
    float input_scale2 = 1;
    int32_t i = 0;

    VSI_UNREFERENCED(param_size);

    attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
    attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
    attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
    CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError );

    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &strides[0]);
    CHECK_STATUS_FAIL_GOTO(status, OnError );
    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &strides[1]);
    CHECK_STATUS_FAIL_GOTO(status, OnError );
    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &strides[2]);
    CHECK_STATUS_FAIL_GOTO(status, OnError );
    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &strides[3]);
    CHECK_STATUS_FAIL_GOTO(status, OnError );
    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &strides[4]);
    CHECK_STATUS_FAIL_GOTO(status, OnError );
    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &strides[5]);
    CHECK_STATUS_FAIL_GOTO(status, OnError );
    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &strides[6]);
    CHECK_STATUS_FAIL_GOTO(status, OnError );
    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &coord_dim);
    CHECK_STATUS_FAIL_GOTO(status, OnError );

    block_size = (int32_t)(attr[2]->shape->data[0]);
    update_width = (int32_t)(attr[1]->shape->data[0]);
    index_num = (int32_t)(attr[0]->shape->data[1]);
    width = block_size;

    input_zp2 = attr[1]->zero_point;
    input_scale2 = attr[1]->scale;

    coord_strides[coord_dim - 1] = 1;
    for (i = 0; i < coord_dim - 1; i++)
    {
        coord_strides[i] = strides[coord_dim - 2 - i];
    }
    memcpy(coord_strides1, coord_strides + 4, 4 * sizeof(int32_t));
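    /* Editorial note (assumption): the 8-entry stride table is split because
     * the shader presumably consumes it as two 4-element uniform vectors;
     * coord_strides1 carries entries 4..7 for coordinate ranks above four. */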

    gpu_param.global_scale[0] = 1;
    gpu_param.global_scale[1] = 1;
    gpu_param.global_scale[2] = 1;

    gpu_param.global_size[0] = width;
    gpu_param.global_size[1] = index_num;
    gpu_param.global_size[2] = 1;

    status = vsi_nn_kernel_gpu_config( node, &gpu_param );
    CHECK_STATUS_FAIL_GOTO(status, OnError);

    {
        gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{
            0x05050505, // TCfg
            0x04040404, // ASelt
            0x00010000, 0x00030002, // ABin
            0x0a0a0a0a, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000400, // AccumType, ConstantType, and PostShift
            0xffff0001, 0x00000000, 0xffff0001, 0x00000000,
            0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant
        }, GPU_DP_TYPE_16 };

        gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
            0x11111111, // TCfg
            0x01010101, // ASelt
            0x01050004, 0x03070206, // ABin
            0x22222222, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000600, // AccumType, ConstantType, and PostShift
            0x00000001, 0x00000001, 0x00000001, 0x00000001,
            0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
        }, GPU_DP_TYPE_16};

        status = vsi_nn_kernel_gpu_add_param( node, "update_width", &update_width );
        status |= vsi_nn_kernel_gpu_add_param( node, "output_width", &block_size );
        status |= vsi_nn_kernel_gpu_add_param( node, "coord_stride", &coord_strides );
        status |= vsi_nn_kernel_gpu_add_param( node, "coord_stride1", &coord_strides1 );
        CHECK_STATUS_FAIL_GOTO(status, OnError);

        if (attr[1]->dtype == U8 || attr[1]->dtype == I8 || attr[1]->dtype == I16)
        {
            status = vsi_nn_kernel_gpu_add_param( node,
                "uniConvert1stUint8SubZpToFp32_4x4", &uniConvert1stUint8SubZpToFp32_4x4 );
            status |= vsi_nn_kernel_gpu_add_param( node, "update_scale", &input_scale2 );
            status |= vsi_nn_kernel_gpu_add_param( node, "update_zp", &input_zp2 );
            CHECK_STATUS_FAIL_GOTO(status, OnError );
        }
        else if (attr[1]->dtype == BF16)
        {
            status = vsi_nn_kernel_gpu_add_param( node,
                "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 );
            CHECK_STATUS_FAIL_GOTO(status, OnError );
        }
    }

OnError:
    if (attr[0])
    {
        vsi_nn_kernel_tensor_attr_release( &attr[0] );
        attr[0] = NULL;
    }
    if (attr[1])
    {
        vsi_nn_kernel_tensor_attr_release( &attr[1] );
        attr[1] = NULL;
    }
    if (attr[2])
    {
        vsi_nn_kernel_tensor_attr_release( &attr[2] );
        attr[2] = NULL;
    }
    return status;
} /* _scatter_nd_update_process_initializer() */

DEF_KERNEL_INITIALIZER(_scatter_nd_update_conv_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VSI_FAILURE;
    gpu_param_t gpu_param = {
        1,
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0}
        };

    vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
    int32_t width = 0;
    int32_t element_size = 1;
    int32_t i = 0;
    float output_zp = 0;
    float output_scale = 1.0f;

    VSI_UNREFERENCED(param_size);

    attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );

    output_zp = (float)attr[0]->zero_point;
    output_scale = (float)1.0 / attr[0]->scale;

    for (i = 0; i < (int32_t)attr[0]->shape->size; i++)
    {
        element_size *= (int32_t)attr[0]->shape->data[i];
    }
    width = element_size / 8;

    gpu_param.global_scale[0] = 1;
    gpu_param.global_scale[1] = 1;
    gpu_param.global_scale[2] = 1;
    if (element_size < 8)
    {
        gpu_param.global_size[0] = element_size;
    }
    else
    {
        gpu_param.global_size[0] = width;
    }
    gpu_param.global_size[1] = 1;
    gpu_param.global_size[2] = 1;

    status = vsi_nn_kernel_gpu_config( node, &gpu_param );
    CHECK_STATUS_FAIL_GOTO(status, OnError);

    {
        gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
            0x33333333, // TCfg
            0x11110000, // ASelt
            0x03020100, 0x03020100, // ABin
            0x00000000, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00002400, // AccumType, ConstantType, and PostShift
            0x00000000, 0x00000000, 0x00000000, 0x00000000,
            0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
        }, GPU_DP_TYPE_16 };
        gpu_dp_inst_t uniExtractHalf8_2x8 = {{
            0x11111111, // TCfg
            0x11110000, // ASelt
            0x06040200, 0x06040200, // ABin
            0x22222222, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000100, // AccumType, ConstantType, and PostShift
            0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
            0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
        }, GPU_DP_TYPE_16 };
        gpu_dp_inst_t uniExtractOddData_2x8 = {{
            0x11111111, // TCfg
            0x11110000, // ASelt
            0x07050301, 0x07050301, // ABin
            0x22222222, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000600, // AccumType, ConstantType, and PostShift
            0x00000001, 0x00000001, 0x00000001, 0x00000001,
            0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
        }, GPU_DP_TYPE_16};

        status = vsi_nn_kernel_gpu_add_param( node,
            "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 );
        status |= vsi_nn_kernel_gpu_add_param( node,
            "uniExtractHalf8_2x8", &uniExtractHalf8_2x8 );
        status |= vsi_nn_kernel_gpu_add_param( node,
            "uniExtractOddData_2x8", &uniExtractOddData_2x8 );
        status |= vsi_nn_kernel_gpu_add_param( node, "output_zp", &output_zp );
        status |= vsi_nn_kernel_gpu_add_param( node, "output_scale", &output_scale );
        CHECK_STATUS_FAIL_GOTO(status, OnError);
    }

OnError:
    if (attr[0])
    {
        vsi_nn_kernel_tensor_attr_release( &attr[0] );
        attr[0] = NULL;
    }
    return status;
} /* _scatter_nd_update_conv_initializer() */

static vsi_status _query_kernel
    (
    vsi_nn_tensor_t* const* const inputs,
    vsi_nn_tensor_t* const* const outputs,
    vsi_nn_kernel_t* kernel_preprocess,
    vsi_nn_kernel_t* kernel_process,
    vsi_nn_kernel_t* kernel_conv,
    int32_t reduction_flg
    )
{
    vsi_status status = VSI_SUCCESS;
    vsi_nn_kernel_dtype_e input0_dtype = U8;
    vsi_nn_kernel_dtype_e input2_dtype = F16;
    vsi_nn_kernel_dtype_e output_dtype = U8;
    uint32_t key = 0;
    size_t i = 0;

    input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
    output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    key = HASH_SCATTER_ND_UPDATE_KEY(input0_dtype, 0, 0, 0, 0);

    for ( i = 0; i < _cnt_of_array(scatter_nd_update_reduction_preprocess_map); i ++ )
    {
        if ( scatter_nd_update_reduction_preprocess_map[i].key == key )
        {
            break;
        }
    }

    if ( i < _cnt_of_array(scatter_nd_update_reduction_preprocess_map) )
    {
        snprintf( kernel_preprocess->info.name, VX_MAX_KERNEL_NAME, "%s",
            scatter_nd_update_reduction_preprocess_map[i].function_name );
        kernel_preprocess->info.parameters = _scatter_nd_update_preprocess_kernel_param_def;
        kernel_preprocess->info.numParams = _SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM;
        kernel_preprocess->info.initialize = _scatter_nd_update_preprocess_initializer;

        vsi_nn_kernel_add_source( kernel_preprocess, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
            "vsi_nn_kernel_header",
            scatter_nd_update_reduction_preprocess_map[i].source_name );
        vsi_nn_kernel_add_source( kernel_preprocess, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
            scatter_nd_update_reduction_preprocess_map[i].source_name );
    }
    else
    {
        status = VSI_FAILURE;
    }

    key = HASH_SCATTER_ND_UPDATE_KEY( 0, input2_dtype, 0, 1, reduction_flg);

    for ( i = 0; i < _cnt_of_array(scatter_nd_update_reduction_process_map); i ++ )
    {
        if ( scatter_nd_update_reduction_process_map[i].key == key )
        {
            break;
        }
    }
    if ( i < _cnt_of_array(scatter_nd_update_reduction_process_map) )
    {
        snprintf( kernel_process->info.name, VX_MAX_KERNEL_NAME, "%s",
            scatter_nd_update_reduction_process_map[i].function_name );
        kernel_process->info.parameters = _scatter_nd_update_process_kernel_param_def;
        kernel_process->info.numParams = _SCATTER_ND_UPDATE_PROCESS_PARAM_NUM;
        kernel_process->info.initialize = _scatter_nd_update_process_initializer;

        vsi_nn_kernel_add_source( kernel_process, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
            "vsi_nn_kernel_header",
            scatter_nd_update_reduction_process_map[i].source_name );
        vsi_nn_kernel_add_source( kernel_process, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
            scatter_nd_update_reduction_process_map[i].source_name );
    }
    else
    {
        status |= VSI_FAILURE;
    }

    key = HASH_SCATTER_ND_UPDATE_KEY( 0, 0, output_dtype, 2, 0);

    for ( i = 0; i < _cnt_of_array(scatter_nd_update_reduction_conv_map); i ++ )
    {
        if ( scatter_nd_update_reduction_conv_map[i].key == key )
        {
            break;
        }
    }
    if ( i < _cnt_of_array(scatter_nd_update_reduction_conv_map) )
    {
        snprintf( kernel_conv->info.name, VX_MAX_KERNEL_NAME, "%s",
            scatter_nd_update_reduction_conv_map[i].function_name );
        kernel_conv->info.parameters = _scatter_nd_update_conv_kernel_param_def;
        kernel_conv->info.numParams = _SCATTER_ND_UPDATE_CONV_PARAM_NUM;
        kernel_conv->info.initialize = _scatter_nd_update_conv_initializer;

        vsi_nn_kernel_add_source( kernel_conv, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
            "vsi_nn_kernel_header",
            scatter_nd_update_reduction_conv_map[i].source_name );
        vsi_nn_kernel_add_source( kernel_conv, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
            scatter_nd_update_reduction_conv_map[i].source_name );
    }
    else
    {
        status |= VSI_FAILURE;
    }

    return status;
} /* _query_kernel() */

static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t ** outputs,
    size_t output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t * kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_t node = NULL;
    vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}};
    vsi_size_t strides[VSI_NN_MAX_DIM_NUM] = {0};
    int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
    int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" );
    int32_t reduction = vsi_nn_kernel_param_get_int32( params, "reduction" );
    int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0;
    int32_t i = 0;
    vsi_nn_tensor_t * tensors[2] = { NULL };
    vsi_nn_kernel_t * ikernels[2] = { NULL };

    VSI_UNREFERENCED(input_num);
    VSI_UNREFERENCED(output_num);

    status = get_scatter_nd_update_tensor_reshape_size(&inputs[1], shapes[0], coord_dim, 0,
        NULL, &rs_idx_dim);
    status |= get_scatter_nd_update_tensor_reshape_size(&inputs[2], shapes[1], block_size, 0,
        NULL, &rs_in_dim);
    status |= get_scatter_nd_update_tensor_reshape_size(&outputs[0], shapes[2], block_size, coord_dim,
        strides, &rs_out_dim);
    CHECK_STATUS_FAIL_GOTO( status, final );

    {
        vsi_nn_tensor_attr_t attr;
        vsi_nn_kernel_node_t preprocess_node = NULL;
        vsi_nn_kernel_node_t process_node = NULL;
        vsi_nn_kernel_node_param_t preprocess_params[_SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM] = { NULL };
        vsi_nn_kernel_node_param_t process_params[_SCATTER_ND_UPDATE_PROCESS_PARAM_NUM] = { NULL };
        vsi_nn_kernel_node_param_t conv_params[_SCATTER_ND_UPDATE_CONV_PARAM_NUM] = { NULL };
        int32_t width = 1;
        int32_t res = 0;

        ikernels[0] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS );
        ikernels[0]->unique_id = kernel->unique_id;
        ikernels[1] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS );
        ikernels[1]->unique_id = kernel->unique_id;

        memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) );
        attr.dtype = outputs[0]->attr.dtype;
        attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
        attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
        attr.is_const = FALSE;
        attr.vtl = TRUE;

        for (i = 0; i < rs_out_dim; i++)
        {
            attr.size[i] = shapes[2][i];
            width *= (int32_t)shapes[2][i];
        }
        attr.dim_num = rs_out_dim;

        res = width % 8;
        width = (width >> 3) << 3;
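
        /* Editorial note: the float staging passes process eight elements per
         * work item, so width is rounded down to a multiple of 8 and res
         * carries the remainder handled at the buffer tail. */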

        tensors[0] = vsi_nn_CreateTensor( graph, &attr ); // ref'
        attr.size[0] = 1;
        attr.size[1] = 1;
        attr.dim_num = rs_out_dim;
        tensors[1] = vsi_nn_CreateTensor( graph, &attr ); // link_buffer0

        status = _query_kernel( inputs, outputs, ikernels[0], ikernels[1], kernel, reduction);
        if ( VSI_SUCCESS == status)
        {
            // convert ref to float
            preprocess_node = vsi_nn_kernel_create_node( graph, ikernels[0] );
            if (preprocess_node)
            {
                uint32_t index = 0;
                /* Pass parameters to node. */
                preprocess_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[2], rs_out_dim );
                preprocess_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t;
                preprocess_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
                preprocess_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &res );
                status = vsi_nn_kernel_node_pass_param( preprocess_node, preprocess_params,
                    _SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM );
                CHECK_STATUS(status);
                vsi_nn_kernel_tensor_release( &preprocess_params[0] );
                vsi_nn_kernel_scalar_release( &preprocess_params[2] );
                vsi_nn_kernel_scalar_release( &preprocess_params[3] );
            }

            // update
            process_node = vsi_nn_kernel_create_node( graph, ikernels[1] );
            if (process_node)
            {
                uint32_t index = 0;
                /* Pass parameters to node. */
                process_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[0], rs_idx_dim );
                process_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shapes[1], rs_in_dim );
                process_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t;
                process_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t;
                process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[0] );
                process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[1] );
                process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[2] );
                process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[3] );
                process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[4] );
                process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[5] );
                process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[6] );
                process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim );
                status = vsi_nn_kernel_node_pass_param( process_node, process_params,
                    _SCATTER_ND_UPDATE_PROCESS_PARAM_NUM );
                CHECK_STATUS(status);
                vsi_nn_kernel_tensor_release( &process_params[0] );
                vsi_nn_kernel_tensor_release( &process_params[1] );
                vsi_nn_kernel_scalar_release( &process_params[4] );
                vsi_nn_kernel_scalar_release( &process_params[5] );
                vsi_nn_kernel_scalar_release( &process_params[6] );
                vsi_nn_kernel_scalar_release( &process_params[7] );
                vsi_nn_kernel_scalar_release( &process_params[8] );
                vsi_nn_kernel_scalar_release( &process_params[9] );
                vsi_nn_kernel_scalar_release( &process_params[10] );
                vsi_nn_kernel_scalar_release( &process_params[11] );
            }

            // convert float to output
            node = vsi_nn_kernel_create_node( graph, kernel );
            if ( node )
            {
                uint32_t index = 0;
                /* Pass parameters to node. */
                conv_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t;
                conv_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t;
                conv_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], rs_out_dim );
                conv_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
                conv_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &res );
                status = vsi_nn_kernel_node_pass_param( node, conv_params, _SCATTER_ND_UPDATE_CONV_PARAM_NUM );
                CHECK_STATUS(status);
                vsi_nn_kernel_tensor_release( &conv_params[2] );
                vsi_nn_kernel_scalar_release( &conv_params[3] );
                vsi_nn_kernel_scalar_release( &conv_params[4] );
            }
        }

        if (preprocess_node) {vsi_nn_kernel_node_release( &preprocess_node );}
        if (process_node) {vsi_nn_kernel_node_release( &process_node );}
    }

final:
    if (ikernels[0])
    {
        vsi_nn_kernel_release(&ikernels[0]);
    }
    if (ikernels[1])
    {
        vsi_nn_kernel_release(&ikernels[1]);
    }
    vsi_safe_release_tensor(tensors[0]);
    vsi_safe_release_tensor(tensors[1]);

    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_EVIS( scatter_nd_update_reduction, _setup )
@@ -22,6 +22,7 @@
 *
 *****************************************************************************/

#if !(VX_TENSOR_SELECT_VX_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@@ -159,7 +160,6 @@ DEF_KERNEL_INITIALIZER(_select_initializer)
    vsi_nn_kernel_tensor_attr_t *input1_attr = NULL;
    vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
    vsi_size_array_t *output_shape = NULL;
    int32_t input0_fl = 0, input1_fl = 0, output_fl = 0;
    float input0Scale = 1.0f;
    int32_t input0Zp = 0;
    float input1Scale = 1.0f;
@@ -180,59 +180,12 @@ DEF_KERNEL_INITIALIZER(_select_initializer)
    output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output);
    CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );

    if ( input0_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        input0_fl = input0_attr->dfp.fl;
        if (input0_fl > 0)
        {
            input0Scale = 1.0f / (float) ((int64_t)1 << input0_fl);
        }
        else
        {
            input0Scale = (float)((int64_t)1 << -input0_fl);
        }
    }
    else if ( input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
    {
        input0Scale = input0_attr->asymm.scale;
        input0Zp = input0_attr->asymm.zero_point;
    }

    if ( input1_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        input1_fl = input1_attr->dfp.fl;
        if (input1_fl > 0)
        {
            input1Scale = 1.0f / (float) ((int64_t)1 << input1_fl);
        }
        else
        {
            input1Scale = (float)((int64_t)1 << -input1_fl);
        }
    }
    else if ( input1_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
    {
        input1Scale = input1_attr->asymm.scale;
        input1Zp = input1_attr->asymm.zero_point;
    }

    if ( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        output_fl = output_attr->dfp.fl;
        if (output_fl > 0)
        {
            outputScale = 1.0f / (float) ((int64_t)1 << output_fl);
        }
        else
        {
            outputScale = (float)((int64_t)1 << -output_fl);
        }
    }
    else if ( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
    {
        outputScale = output_attr->asymm.scale;
        outputZP = output_attr->asymm.zero_point;
    }
    input0Scale = input0_attr->scale;
    input0Zp = input0_attr->zero_point;
    input1Scale = input1_attr->scale;
    input1Zp = input1_attr->zero_point;
    outputScale = output_attr->scale;
    outputZP = output_attr->zero_point;

    gpu_quantize_multiplier_16bit(input0Scale / outputScale, &in0_M0, &in0_postShift);
    gpu_quantize_multiplier_16bit(input1Scale / outputScale, &in1_M0, &in1_postShift);
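
    /* Editorial note: this hunk, like the sequence_mask, slice, get_matrix,
     * warp_affine and swish hunks that follow, drops the per-format DFP and
     * asymmetric branches in favor of the unified attr->scale and
     * attr->zero_point fields that the updated tensor-attr code precomputes
     * for every quantization format. */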

@@ -541,3 +494,4 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_EVIS( select, _setup )
#endif

@@ -131,42 +131,10 @@ DEF_KERNEL_INITIALIZER(_sequence_mask_initializer)
    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );

    out_shape = attr[1]->shape;

    if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
    {
        input_zp = attr[0]->asymm.zero_point;
        scaleIn = attr[0]->asymm.scale;
    }
    else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
    {
        if (attr[0]->dfp.fl > 0)
        {
            scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
        }
        else
        {
            scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
        }
        input_zp = 0;
    }

    if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
    {
        output_zp = attr[1]->asymm.zero_point;
        scaleOut = 1.0f / attr[1]->asymm.scale;
    }
    else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP)
    {
        if (attr[1]->dfp.fl > 0)
        {
            scaleOut = (float)((int64_t)1 << attr[1]->dfp.fl);
        }
        else
        {
            scaleOut = (1.0f / (float)((int64_t)1 << -attr[1]->dfp.fl));
        }
        output_zp = 0;
    }
    input_zp = attr[0]->zero_point;
    scaleIn = attr[0]->scale;
    output_zp = attr[1]->zero_point;
    scaleOut = 1.0f / attr[1]->scale;

    outputVal1 = scaleOut + (float)output_zp;

@@ -157,8 +157,6 @@ DEF_KERNEL_INITIALIZER(_slice_initializer)
    float scaleOut = 1.0f;
    int32_t output_ZP = 0;
    int32_t input_ZP = 0;
    int32_t srcFixPointPos = 0;
    int32_t dstFixPointPos = 0;
    int32_t is_samefl = 0;
    uint32_t pack_key = 0;

@@ -178,41 +176,10 @@ DEF_KERNEL_INITIALIZER(_slice_initializer)

pack_key = _PACK_SLICE_KEY( input_dtype, output_dtype, is_samefl);
|
||||
|
||||
if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
|
||||
{
|
||||
srcFixPointPos = input_attr->dfp.fl;
|
||||
if (srcFixPointPos > 0)
|
||||
{
|
||||
scaleIn = (1.0f / ((float) ((int64_t)1 << srcFixPointPos)));
|
||||
}
|
||||
else
|
||||
{
|
||||
scaleIn = ((float) ((int64_t)1 << -srcFixPointPos));
|
||||
}
|
||||
}
|
||||
else if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant)
|
||||
{
|
||||
input_ZP = input_attr->asymm.zero_point;
|
||||
scaleIn = input_attr->asymm.scale;
|
||||
}
|
||||
|
||||
if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant)
|
||||
{
|
||||
dstFixPointPos = output_attr->dfp.fl;
|
||||
if (dstFixPointPos > 0)
|
||||
{
|
||||
scaleOut = (1.0f / ((float) ((int64_t)1 << dstFixPointPos)));
|
||||
}
|
||||
else
|
||||
{
|
||||
scaleOut = ((float) ((int64_t)1 << -dstFixPointPos));
|
||||
}
|
||||
}
|
||||
else if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant)
|
||||
{
|
||||
output_ZP = output_attr->asymm.zero_point;
|
||||
scaleOut = output_attr->asymm.scale;
|
||||
}
|
||||
input_ZP = input_attr->zero_point;
|
||||
scaleIn = input_attr->scale;
|
||||
output_ZP = output_attr->zero_point;
|
||||
scaleOut = output_attr->scale;
|
||||
|
||||
if ((I8 == input_dtype && input_dtype == output_dtype ) ||
|
||||
(U8 == input_dtype && input_dtype == output_dtype ) )
|
||||
|
|
|
|||
|
|
@ -170,23 +170,8 @@ DEF_KERNEL_INITIALIZER(_get_matrix_initializer)
|
|||
attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
|
||||
CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final );
|
||||
|
||||
if ( attr->quant == VSI_NN_KERNEL_QUANT_DFP )
|
||||
{
|
||||
int32_t fl = attr->dfp.fl;
|
||||
if (fl > 0)
|
||||
{
|
||||
input_scale = 1.0f / (float) ((int64_t)1 << fl);
|
||||
}
|
||||
else
|
||||
{
|
||||
input_scale = (float)((int64_t)1 << -fl);
|
||||
}
|
||||
}
|
||||
else if ( attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
|
||||
{
|
||||
input_scale = attr->asymm.scale;
|
||||
input_tail = 0 - attr->asymm.zero_point * input_scale;
|
||||
}
|
||||
input_scale = attr->scale;
|
||||
input_tail = 0 - attr->zero_point * input_scale;
|
||||
|
||||
in_shape = attr->shape;
|
||||
|
||||
|
|
@@ -265,42 +250,10 @@ DEF_KERNEL_INITIALIZER(_warp_affine_initializer)
     attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
     CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
 
-    if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
-    {
-        int32_t fl = attr[0]->dfp.fl;
-        if (fl > 0)
-        {
-            input_scale = 1.0f / (float) ((int64_t)1 << fl);
-        }
-        else
-        {
-            input_scale = (float)((int64_t)1 << -fl);
-        }
-    }
-    else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
-    {
-        input_scale = attr[0]->asymm.scale;
-        input_tail = 0 - attr[0]->asymm.zero_point * input_scale;
-    }
-
-    if (attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP)
-    {
-        int32_t fl = attr[1]->dfp.fl;
-
-        if (fl >= 0)
-        {
-            output_scale = (vx_float32) ((vx_int64)1 << fl);
-        }
-        else if (fl < 0)
-        {
-            output_scale = 1.0f / (vx_float32) ((vx_int64)1 << -fl);
-        }
-    }
-    else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
-    {
-        output_scale = 1.0f / attr[1]->asymm.scale;
-        output_zp = (float)attr[1]->asymm.zero_point;
-    }
+    input_scale = attr[0]->scale;
+    input_tail = 0 - attr[0]->zero_point * input_scale;
+    output_scale = 1.0f / attr[1]->scale;
+    output_zp = (float)attr[1]->zero_point;
 
     out_shape = attr[1]->shape;
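Note: on the output side the initializer stores the reciprocal, output_scale = 1.0f / attr[1]->scale, so requantization in the kernel is a multiply rather than a divide. A hypothetical helper (names and clamping are assumptions, not this commit's shader code) showing the intended use:

    #include <stdint.h>

    /* q = real * (1/scale) + zero_point, with the reciprocal precomputed
     * once by the initializer so the per-element path avoids a divide. */
    static uint8_t requantize_u8( float real,
                                  float output_scale,  /* == 1 / scale */
                                  float output_zp )
    {
        float q = real * output_scale + output_zp;
        if ( q < 0.0f )   q = 0.0f;       /* clamp to the U8 range */
        if ( q > 255.0f ) q = 255.0f;
        return (uint8_t)( q + 0.5f );     /* round to nearest      */
    }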
@@ -166,8 +166,6 @@ DEF_KERNEL_INITIALIZER(_swish_initializer)
 
     vx_tensor input = (vx_tensor)param[0];
     vx_tensor output = (vx_tensor)param[1];
-    int8_t srcFixPointPos = 0;
-    int8_t dstFixPointPos = 0;
     vx_float32 inputTail = 0;
     vx_float32 inputScale = 1.0f;
     vx_float32 outputZP = 0;
@@ -186,42 +184,11 @@ DEF_KERNEL_INITIALIZER(_swish_initializer)
     CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
 
     out_shape = output_attr->shape;
+    inputScale = input_attr->scale;
+    inputTail = 0 - (vx_float32)input_attr->zero_point * inputScale;
+    outputScale = 1.0f / output_attr->scale;
+    outputZP = (vx_float32)(output_attr->zero_point);
 
-    if (input_attr->quant == VSI_NN_KERNEL_QUANT_DFP)
-    {
-        srcFixPointPos = (int8_t)input_attr->dfp.fl;
-        if (srcFixPointPos > 0)
-        {
-            inputScale = 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos);
-        }
-        else
-        {
-            inputScale = (vx_float32)((int64_t)1 << -srcFixPointPos);
-        }
-    }
-    else if (input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM || input_attr->quant == VSI_NN_KERNEL_QUANT_SYMM)
-    {
-        inputScale = input_attr->asymm.scale;
-        inputTail = 0 - input_attr->asymm.zero_point * inputScale;
-    }
-
-    if (output_attr->quant == VSI_NN_KERNEL_QUANT_DFP)
-    {
-        dstFixPointPos = (int8_t)output_attr->dfp.fl;
-        if (dstFixPointPos > 0)
-        {
-            outputScale = (vx_float32) ((int64_t)1 << dstFixPointPos);
-        }
-        else
-        {
-            outputScale = 1.0f / (vx_float32) ((int64_t)1 << -dstFixPointPos);
-        }
-    }
-    else if (output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM || output_attr->quant == VSI_NN_KERNEL_QUANT_SYMM)
-    {
-        outputScale = 1.0f / output_attr->asymm.scale;
-        outputZP = (vx_float32)(output_attr->asymm.zero_point);
-    }
 #define _PACK_SELECT_KEY( IN_TYPE, OUT_TYPE ) \
     (IN_TYPE | ( OUT_TYPE << 16))
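Note: _PACK_SELECT_KEY packs the input and output dtype enums into one 32-bit key (assuming each enum value fits in 16 bits) so a single switch can dispatch on the pair. A self-contained sketch with illustrative dtype codes and kernel names:

    #include <stdint.h>

    enum { I8 = 0, U8 = 1, F16 = 2 };   /* illustrative dtype codes */

    #define _PACK_SELECT_KEY( IN_TYPE, OUT_TYPE ) \
        ( (uint32_t)(IN_TYPE) | ( (uint32_t)(OUT_TYPE) << 16 ) )

    /* One switch over the packed (input, output) pair selects a variant. */
    static const char * select_kernel( uint32_t in_dtype, uint32_t out_dtype )
    {
        switch ( _PACK_SELECT_KEY( in_dtype, out_dtype ) )
        {
            case _PACK_SELECT_KEY( U8,  U8  ): return "swish_u8_to_u8";
            case _PACK_SELECT_KEY( I8,  I8  ): return "swish_i8_to_i8";
            case _PACK_SELECT_KEY( F16, F16 ): return "swish_f16_to_f16";
            default:                           return "unsupported";
        }
    }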
@@ -379,8 +346,6 @@ DEF_KERNEL_INITIALIZER(_hswish_initializer)
 
     vx_tensor input = (vx_tensor)param[0];
    vx_tensor output = (vx_tensor)param[1];
-    int8_t srcFixPointPos = 0;
-    int8_t dstFixPointPos = 0;
     vx_float32 inputTail = 0;
     vx_float32 inputScale = 1.0f;
     vx_float32 outputZP = 0;
@@ -398,42 +363,11 @@ DEF_KERNEL_INITIALIZER(_hswish_initializer)
     CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
 
     out_shape = output_attr->shape;
+    inputScale = input_attr->scale;
+    inputTail = 0 - (vx_float32)input_attr->zero_point * inputScale;
+    outputScale = 1.0f / output_attr->scale;
+    outputZP = (vx_float32)(output_attr->zero_point);
 
-    if (input_attr->quant == VSI_NN_KERNEL_QUANT_DFP)
-    {
-        srcFixPointPos = (int8_t)input_attr->dfp.fl;
-        if (srcFixPointPos > 0)
-        {
-            inputScale = 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos);
-        }
-        else
-        {
-            inputScale = (vx_float32)((int64_t)1 << -srcFixPointPos);
-        }
-    }
-    else if (input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM || input_attr->quant == VSI_NN_KERNEL_QUANT_SYMM)
-    {
-        inputScale = input_attr->asymm.scale;
-        inputTail = 0 - input_attr->asymm.zero_point * inputScale;
-    }
-
-    if (output_attr->quant == VSI_NN_KERNEL_QUANT_DFP)
-    {
-        dstFixPointPos = (int8_t)output_attr->dfp.fl;
-        if (dstFixPointPos > 0)
-        {
-            outputScale = (vx_float32) ((int64_t)1 << dstFixPointPos);
-        }
-        else
-        {
-            outputScale = 1.0f / (vx_float32) ((int64_t)1 << -dstFixPointPos);
-        }
-    }
-    else if (output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM || output_attr->quant == VSI_NN_KERNEL_QUANT_SYMM)
-    {
-        outputScale = 1.0f / output_attr->asymm.scale;
-        outputZP = (vx_float32)(output_attr->asymm.zero_point);
-    }
 #define _PACK_SELECT_KEY( IN_TYPE, OUT_TYPE ) \
     (IN_TYPE | ( OUT_TYPE << 16))
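Note: the swish and hswish hunks are structurally identical; in both, the unified parameters presumably feed the usual dequantize / activate / requantize pipeline on the device side. A sketch of that consumption, with a hypothetical hswish_ref() reference and U8 rounding that are assumptions rather than the shader code shipped in this commit:

    #include <math.h>
    #include <stdint.h>

    /* Reference hswish: x * relu6(x + 3) / 6 */
    static float hswish_ref( float x )
    {
        float r = x + 3.0f;
        r = r < 0.0f ? 0.0f : ( r > 6.0f ? 6.0f : r );
        return x * r / 6.0f;
    }

    static uint8_t hswish_quantized( uint8_t q,
                                     float inputScale, float inputTail,
                                     float outputScale, float outputZP )
    {
        float x   = (float)q * inputScale + inputTail;   /* dequantize  */
        float y   = hswish_ref( x );                     /* activation  */
        float out = y * outputScale + outputZP;          /* requantize  */
        return (uint8_t)fminf( fmaxf( out + 0.5f, 0.0f ), 255.0f );
    }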