diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f105a65 --- /dev/null +++ b/.gitignore @@ -0,0 +1,335 @@ +## Ignore Visual Studio temporary files, build results, and +## files generated by popular Visual Studio add-ons. +## +## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore + +# User-specific files +*.suo +*.user +*.userosscache +*.sln.docstates + +*-[Dd]ebug/ +*-[Dd]ebugPublic/ +*-[Rr]elease/ +*-[Rr]eleases/ + +# User-specific files (MonoDevelop/Xamarin Studio) +*.userprefs + +# Build results +*.o +[Dd]ebug/ +[Dd]ebugPublic/ +[Rr]elease/ +[Rr]eleases/ +x64/ +x86/ +bld/ +[Bb]in/ +[Oo]bj/ +[Ll]og/ +NNApi0.3/ +NNApi0.4/ +OpenVX1.2/ +bazel-bin +bazel-genfiles +bazel-out +bazel-ovxlib +bazel-testlogs + +# Visual Studio 2015/2017 cache/options directory +.vs/ +# Uncomment if you have tasks that create the project's static files in wwwroot +#wwwroot/ + +# VS code +.vscode + +# Visual Studio 2017 auto generated files +Generated\ Files/ + +# MSTest test Results +[Tt]est[Rr]esult*/ +[Bb]uild[Ll]og.* + +# NUNIT +*.VisualState.xml +TestResult.xml + +# Build Results of an ATL Project +[Dd]ebugPS/ +[Rr]eleasePS/ +dlldata.c + +# Benchmark Results +BenchmarkDotNet.Artifacts/ + +# .NET Core +project.lock.json +project.fragment.lock.json +artifacts/ +**/Properties/launchSettings.json + +# StyleCop +StyleCopReport.xml + +# Files built by Visual Studio +*_i.c +*_p.c +*_i.h +*.ilk +*.meta +*.obj +*.pch +*.pdb +*.pgc +*.pgd +*.rsp +*.sbr +*.tlb +*.tli +*.tlh +*.tmp +*.tmp_proj +*.log +*.vspscc +*.vssscc +.builds +*.pidb +*.svclog +*.scc + +# Chutzpah Test files +_Chutzpah* + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opendb +*.opensdf +*.sdf +*.cachefile +*.VC.db +*.VC.VC.opendb + +# Visual Studio profiler +*.psess +*.vsp +*.vspx +*.sap + +# Visual Studio Trace Files +*.e2e + +# TFS 2012 Local Workspace +$tf/ + +# Guidance Automation Toolkit +*.gpState + +# ReSharper is a .NET coding add-in +_ReSharper*/ +*.[Rr]e[Ss]harper +*.DotSettings.user + +# JustCode is a .NET coding add-in +.JustCode + +# TeamCity is a build add-in +_TeamCity* + +# DotCover is a Code Coverage Tool +*.dotCover + +# AxoCover is a Code Coverage Tool +.axoCover/* +!.axoCover/settings.json + +# Visual Studio code coverage results +*.coverage +*.coveragexml + +# NCrunch +_NCrunch_* +.*crunch*.local.xml +nCrunchTemp_* + +# MightyMoose +*.mm.* +AutoTest.Net/ + +# Web workbench (sass) +.sass-cache/ + +# Installshield output folder +[Ee]xpress/ + +# DocProject is a documentation generator add-in +DocProject/buildhelp/ +DocProject/Help/*.HxT +DocProject/Help/*.HxC +DocProject/Help/*.hhc +DocProject/Help/*.hhk +DocProject/Help/*.hhp +DocProject/Help/Html2 +DocProject/Help/html + +# Click-Once directory +publish/ + +# Publish Web Output +*.[Pp]ublish.xml +*.azurePubxml +# Note: Comment the next line if you want to checkin your web deploy settings, +# but database connection strings (with potential passwords) will be unencrypted +*.pubxml +*.publishproj + +# Microsoft Azure Web App publish settings. Comment the next line if you want to +# checkin your Azure Web App publish settings, but sensitive information contained +# in these scripts will be unencrypted +PublishScripts/ + +# NuGet Packages +*.nupkg +# The packages folder can be ignored because of Package Restore +**/[Pp]ackages/* +# except build/, which is used as an MSBuild target. 
+!**/[Pp]ackages/build/ +# Uncomment if necessary however generally it will be regenerated when needed +#!**/[Pp]ackages/repositories.config +# NuGet v3's project.json files produces more ignorable files +*.nuget.props +*.nuget.targets + +# Microsoft Azure Build Output +csx/ +*.build.csdef + +# Microsoft Azure Emulator +ecf/ +rcf/ + +# Windows Store app package directories and files +AppPackages/ +BundleArtifacts/ +Package.StoreAssociation.xml +_pkginfo.txt +*.appx + +# Visual Studio cache files +# files ending in .cache can be ignored +*.[Cc]ache +# but keep track of directories ending in .cache +!*.[Cc]ache/ + +# Others +ClientBin/ +~$* +*~ +*.dbmdl +*.dbproj.schemaview +*.jfm +*.pfx +*.publishsettings +orleans.codegen.cs + +# Since there are multiple workflows, uncomment next line to ignore bower_components +# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) +#bower_components/ + +# RIA/Silverlight projects +Generated_Code/ + +# Backup & report files from converting an old project file +# to a newer Visual Studio version. Backup files are not needed, +# because we have git ;-) +_UpgradeReport_Files/ +Backup*/ +UpgradeLog*.XML +UpgradeLog*.htm + +# SQL Server files +*.mdf +*.ldf +*.ndf + +# Business Intelligence projects +*.rdl.data +*.bim.layout +*.bim_*.settings + +# Microsoft Fakes +FakesAssemblies/ + +# GhostDoc plugin setting file +*.GhostDoc.xml + +# Node.js Tools for Visual Studio +.ntvs_analysis.dat +node_modules/ + +# TypeScript v1 declaration files +typings/ + +# Visual Studio 6 build log +*.plg + +# Visual Studio 6 workspace options file +*.opt + +# Visual Studio 6 auto-generated workspace file (contains which files were open etc.) +*.vbw + +# Visual Studio LightSwitch build output +**/*.HTMLClient/GeneratedArtifacts +**/*.DesktopClient/GeneratedArtifacts +**/*.DesktopClient/ModelManifest.xml +**/*.Server/GeneratedArtifacts +**/*.Server/ModelManifest.xml +_Pvt_Extensions + +# Paket dependency manager +.paket/paket.exe +paket-files/ + +# FAKE - F# Make +.fake/ + +# JetBrains Rider +.idea/ +*.sln.iml + +# CodeRush +.cr/ + +# Python Tools for Visual Studio (PTVS) +__pycache__/ +*.pyc + +# Cake - Uncomment if you are using it +# tools/** +# !tools/packages.config + +# Tabs Studio +*.tss + +# Telerik's JustMock configuration file +*.jmconfig + +# BizTalk build output +*.btp.cs +*.btm.cs +*.odx.cs +*.xsd.cs + +# OpenCover UI analysis results +OpenCover/ + +# Azure Stream Analytics local run output +ASALocalRun/ + +# IDE +.settings/ diff --git a/src/tim/vx/internal/include/interface/ops.def b/src/tim/vx/internal/include/interface/ops.def index 0a5077b..88c74c2 100644 --- a/src/tim/vx/internal/include/interface/ops.def +++ b/src/tim/vx/internal/include/interface/ops.def @@ -144,3 +144,5 @@ DEF_OP(PRE_PROCESS_YUV444) DEF_OP(PRE_PROCESS_NV12) DEF_OP(SCATTER_ND) DEF_OP(DECONVOLUTION1D) +DEF_OP(INTERP) +DEF_OP(RESIZE_1D) diff --git a/src/tim/vx/internal/include/internal/internal_ops.def b/src/tim/vx/internal/include/internal/internal_ops.def index 2a1ac9e..e8f677b 100644 --- a/src/tim/vx/internal/include/internal/internal_ops.def +++ b/src/tim/vx/internal/include/internal/internal_ops.def @@ -14,3 +14,5 @@ DEF_OP(RESIZE_NEAREST_INTERNAL) DEF_OP(DEPTH2SPACE_INTERNAL) DEF_OP(GRUCELL_ACTIVATION_INTERNAL) DEF_OP(GRUCELL_ACTIVATION_INTERNAL_SMA) +DEF_OP(RESIZE_1D_BILINEAR_INTERNAL) +DEF_OP(RESIZE_1D_NEAREST_INTERNAL) diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_elu.h b/src/tim/vx/internal/include/ops/vsi_nn_op_elu.h index 87cf3aa..b4ebd3c 100644 --- 
a/src/tim/vx/internal/include/ops/vsi_nn_op_elu.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_elu.h @@ -44,6 +44,7 @@ typedef struct _vsi_nn_elu_param { /* elu layer local data structure */ vsi_nn_elu_lcl_data local; + float alpha; } vsi_nn_elu_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_interp.h b/src/tim/vx/internal/include/ops/vsi_nn_op_interp.h new file mode 100644 index 0000000..5f1bfb2 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_interp.h @@ -0,0 +1,44 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_INTERP_H +#define _VSI_NN_OP_INTERP_H + +#include "vsi_nn_types.h" + +typedef struct _vsi_nn_interp_param +{ + struct _interp_local_data_t* local; + int32_t height; // height of output + int32_t width; // width of output + int32_t zoom_factor; // zoom factor + int32_t shrink_factor; // shrink factor + int32_t pad_beg; // padding at beginning of input + int32_t pad_end; // padding at end of input +} vsi_nn_interp_param; + + + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_resize_1d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_1d.h new file mode 100644 index 0000000..e85aa74 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_1d.h @@ -0,0 +1,44 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_RESIZE_1D_H +#define _VSI_NN_OP_RESIZE_1D_H + +#include "vsi_nn_types.h" + +typedef struct _vsi_nn_resize_1d_param +{ + struct _resize_1d_local_data_t* local; + vsi_enum type; + float factor; + int32_t size[2]; + vsi_bool align_corners; + vsi_bool half_pixel_centers; +} vsi_nn_resize_1d_param; + +_compiler_assert(offsetof(vsi_nn_resize_1d_param, local) == 0, \ + vsi_nn_resize_1d_h ); + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_resize_1d_bilinear_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_1d_bilinear_internal.h new file mode 100644 index 0000000..4e119c8 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_1d_bilinear_internal.h @@ -0,0 +1,42 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_RESIZE_1D_BILINEAR_INTERNAL_H +#define _VSI_NN_OP_RESIZE_1D_BILINEAR_INTERNAL_H + +#include "vsi_nn_types.h" + +typedef struct _vsi_nn_resize_1d_bilinear_internal_param +{ + struct _resize_1d_bilinear_internal_local_data_t* local; + vsi_bool align_corners; + vsi_bool half_pixel_centers; + float factor; +} vsi_nn_resize_1d_bilinear_internal_param; + +_compiler_assert(offsetof(vsi_nn_resize_1d_bilinear_internal_param, local) == 0, \ + vsi_nn_resize_1d_bilinear_internal_h ); + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_resize_1d_nearest_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_1d_nearest_internal.h new file mode 100644 index 0000000..cc94051 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_1d_nearest_internal.h @@ -0,0 +1,42 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_RESIZE_1D_NEAREST_INTERNAL_H +#define _VSI_NN_OP_RESIZE_1D_NEAREST_INTERNAL_H + +#include "vsi_nn_types.h" + +typedef struct _vsi_nn_resize_1d_nearest_internal_param +{ + struct _resize_1d_nearest_internal_local_data_t* local; + vsi_bool align_corners; + vsi_bool half_pixel_centers; + float factor; +} vsi_nn_resize_1d_nearest_internal_param; + +_compiler_assert(offsetof(vsi_nn_resize_1d_nearest_internal_param, local) == 0, \ + vsi_nn_resize_1d_nearest_internal_h ); + +#endif + diff --git a/src/tim/vx/internal/include/utils/vsi_nn_util.h b/src/tim/vx/internal/include/utils/vsi_nn_util.h index 93180c7..de9d470 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_util.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_util.h @@ -195,13 +195,6 @@ OVXLIB_API vsi_bool vsi_nn_CheckFilePath const char *path ); -OVXLIB_API void vsi_nn_GetFP32MultiAndPostShift - ( - vx_float32 mult, - vx_uint16 *M0, - vx_int8 *N - ); - /** * Malloc aligned buffer * Malloc address and size aligned buffer. 
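Background for the hunk above: it drops vsi_nn_GetFP32MultiAndPostShift() from the public utils header, and the EVIS initializers later in this patch call gpu_quantize_multiplier_16bit() instead. Both helpers decompose a floating-point requantization scale into a 16-bit multiplier M0 and a post shift N with scale ~= M0 * 2^(-N), so the per-element rescale becomes an integer multiply plus shift. A minimal sketch of that decomposition, assuming a positive scale and the usual frexp-based approach; decompose_scale is a hypothetical name for illustration, not the library's actual implementation:

#include <math.h>
#include <stdint.h>

/* Illustration only: split a positive float scale into a multiplier M0 in
 * [2^14, 2^15] and a post shift N such that scale ~= M0 * 2^(-N).
 * The real gpu_quantize_multiplier_16bit() may round and clamp differently. */
static void decompose_scale(float scale, uint16_t *M0, int32_t *N)
{
    int exp = 0;
    float mant = frexpf(scale, &exp); /* scale = mant * 2^exp, mant in [0.5, 1) */
    *M0 = (uint16_t)roundf(mant * (float)(1 << 15));
    *N = 15 - exp;                    /* scale ~= M0 * 2^(exp - 15) */
}

The kernels below then fold the zero points into a second constant, e.g. multAndoutZP[1] = (output_ZP << postShift) - input_ZP * M0, so only the multiplier pair changes per tensor.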
diff --git a/src/tim/vx/internal/include/vsi_nn_graph_optimization.h b/src/tim/vx/internal/include/vsi_nn_graph_optimization.h index bdf2e5a..1f43353 100644 --- a/src/tim/vx/internal/include/vsi_nn_graph_optimization.h +++ b/src/tim/vx/internal/include/vsi_nn_graph_optimization.h @@ -32,6 +32,13 @@ extern "C" { #endif +vx_tensor vsi_nn_CreateRawTensorFromData + ( + vsi_nn_graph_t * graph, + uint8_t * data, + vsi_nn_tensor_attr_t * attr + ); + vsi_status vsi_nn_OptimizeGraph ( vsi_nn_graph_t* graph, diff --git a/src/tim/vx/internal/include/vsi_nn_node_type.h b/src/tim/vx/internal/include/vsi_nn_node_type.h index 6304280..9f13725 100644 --- a/src/tim/vx/internal/include/vsi_nn_node_type.h +++ b/src/tim/vx/internal/include/vsi_nn_node_type.h @@ -158,6 +158,10 @@ #include "ops/vsi_nn_op_squeeze.h" #include "ops/vsi_nn_op_expand_broadcast.h" #include "ops/vsi_nn_op_deconvolution1d.h" +#include "ops/vsi_nn_op_interp.h" +#include "ops/vsi_nn_op_resize_1d.h" +#include "ops/vsi_nn_op_resize_1d_bilinear_internal.h" +#include "ops/vsi_nn_op_resize_1d_nearest_internal.h" /* custom node head define define */ #include "custom/vsi_nn_custom_node_type.h" @@ -302,6 +306,10 @@ typedef union _vsi_nn_nn_param vsi_nn_squeeze_param squeeze; vsi_nn_expand_broadcast_param expand_broadcast; vsi_nn_deconvolution1d_param deconvolution1d; + vsi_nn_interp_param interp; + vsi_nn_resize_1d_param resize_1d; + vsi_nn_resize_1d_bilinear_internal_param resize_1d_bilinear_internal; + vsi_nn_resize_1d_nearest_internal_param resize_1d_nearest_internal; uint8_t client_param[128]; /* custom node data struct define */ diff --git a/src/tim/vx/internal/include/vsi_nn_version.h b/src/tim/vx/internal/include/vsi_nn_version.h index 5e68dec..da62e48 100644 --- a/src/tim/vx/internal/include/vsi_nn_version.h +++ b/src/tim/vx/internal/include/vsi_nn_version.h @@ -33,7 +33,7 @@ extern "C"{ #define VSI_NN_VERSION_MAJOR 1 #define VSI_NN_VERSION_MINOR 1 -#define VSI_NN_VERSION_PATCH 28 +#define VSI_NN_VERSION_PATCH 30 #define VSI_NN_VERSION \ (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH) diff --git a/src/tim/vx/internal/src/Android.mk b/src/tim/vx/internal/src/Android.mk index 235a845..61345a2 100644 --- a/src/tim/vx/internal/src/Android.mk +++ b/src/tim/vx/internal/src/Android.mk @@ -117,6 +117,7 @@ LOCAL_C_INCLUDES += \ LOCAL_CFLAGS := \ -DLINUX \ -D'OVXLIB_API=__attribute__((visibility("default")))' \ + -DANDROID_SDK_VERSION=$(PLATFORM_SDK_VERSION)\ -Wno-sign-compare \ -Wno-implicit-function-declaration \ -Wno-sometimes-uninitialized \ diff --git a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c index b2afa0a..c0de129 100644 --- a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c @@ -168,12 +168,14 @@ static vx_param_description_t kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define SCALAR_INPUT_SCALE (2) #define SCALAR_INPUT_TAIL (3) #define SCALAR_OUTPUT_SCALE (4) #define SCALAR_OUTPUT_ZP (5) +#define SCALAR_ALPHA (6) #define _CL_PARAM_NUM _cnt_of_array(kernel_param_def) /* @@ -293,6 +295,7 @@ static vsi_nn_kernel_node_t _setup float inputTail = (float)inputs[0]->attr.dtype.zero_point * inputScale; float outputScale = outputs[0]->attr.dtype.scale; float outputZP = 
(float)outputs[0]->attr.dtype.zero_point + 0.5f; + float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" ); ret = vsi_nn_kernel_optimize_element_shape( (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, @@ -331,6 +334,8 @@ static vsi_nn_kernel_node_t _setup graph, F32, &outputScale ); node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create( graph, F32, &outputZP ); + node_params[SCALAR_ALPHA] = vsi_nn_kernel_scalar_create( + graph, F32, &alpha ); /* Pass parameters to node. */ status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM ); @@ -369,6 +374,11 @@ OnError: vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] ); } + if (node_params[SCALAR_ALPHA]) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALPHA] ); + } + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/cl/moments_cl.c b/src/tim/vx/internal/src/kernel/cl/moments_cl.c index 8a72060..d258e39 100644 --- a/src/tim/vx/internal/src/kernel/cl/moments_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/moments_cl.c @@ -356,7 +356,8 @@ static vsi_nn_kernel_node_t _setup int32_t out_shape[VSI_NN_MAX_DIM_NUM] = {0}; int32_t out_rs_flg = 0; int32_t axis_num = 0; - int32_t* axis = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "axis", (size_t*)&axis_num); + size_t axis_num_temp = 0; + int32_t* axis = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "axis", &axis_num_temp); int32_t keep_dim = vsi_nn_kernel_param_get_int32( params, "keep_dim" ); int32_t first_axis = axis[0]; int32_t i = 0; @@ -369,6 +370,8 @@ static vsi_nn_kernel_node_t _setup float input_scale = inputs[0]->attr.dtype.scale; float dim_ratio = (float)1.0 / (float)(width * height); + axis_num = (int32_t)axis_num_temp; + if(inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) { if (inputs[0]->attr.dtype.fl > 0) diff --git a/src/tim/vx/internal/src/kernel/cl/resize_1d_bilinear_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_1d_bilinear_cl.c new file mode 100644 index 0000000..f8ff904 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/resize_1d_bilinear_cl.c @@ -0,0 +1,305 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +#define _RESIZE_1D_BILINEAR_KERNEL_SOURCE() "resize_1d_bilinear" + +#define STR(a) #a +// Add kernel hashtable here +#define RESIZE_1D_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ + (( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) ) + +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_1D_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \ + CVIVANTE_NAMESPACE("cl.resize_1d_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + _RESIZE_1D_BILINEAR_KERNEL_SOURCE() } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _resize_1d_bilinear_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( F32, F32), + PACK_KERNEL_MAP( U8, U8), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _resize_1d_bilinear_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _RESIZE_1D_BILINEAR_PARAM_NUM _cnt_of_array( _resize_1d_bilinear_kernel_param_def ) + + +#define SCALAR_SCALE_X (2) +#define SCALAR_HALF_PIXEL (3) +#define SCALAR_INPUT_SCALE (4) +#define SCALAR_INPUT_TAIL (5) +#define SCALAR_OUTPUT_SCALE (6) +#define SCALAR_OUTPUT_TAIL (7) + + +#define RESIZE_1D_BILINEAR_NUM 4 +#define RESIZE_1D_BILINEAR_QUANT_NUM _cnt_of_array( _resize_1d_bilinear_kernel_param_def ) + + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_resize_1d_bilinear_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_int_array_t * out_shape = NULL; + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + out_shape = output_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.dim = (out_shape->size < 3 || 1 == out_shape->data[2]) ? 2 : 3; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ?
out_shape->data[2] : 1; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(output_attr); + return status; +} /* _resize_1d_bilinear_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool *is_use_u8_kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype = F16; + vsi_nn_kernel_dtype_e out_dtype = F16; + const _kernel_map_type * kernel_map = _resize_1d_bilinear_kernel_map; + size_t kernel_map_size = _cnt_of_array( _resize_1d_bilinear_kernel_map ); + vx_param_description_t * param_def = _resize_1d_bilinear_kernel_param_def; + size_t param_def_size = _cnt_of_array( _resize_1d_bilinear_kernel_param_def ); + vx_kernel_initialize_f initializer = _resize_1d_bilinear_initializer; + + uint32_t key = 0; + uint32_t i = 0; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == in_dtype) + { + in_dtype = F32; + } + if (F16 == out_dtype) + { + out_dtype = F32; + } + + + if ((U8 == in_dtype) || (U8 == out_dtype)) + { + param_def_size = RESIZE_1D_BILINEAR_QUANT_NUM; + *is_use_u8_kernel = TRUE; + } + else + { + param_def_size = RESIZE_1D_BILINEAR_NUM; + *is_use_u8_kernel = FALSE; + } + + key = RESIZE_1D_BILINEAR_HASH_KEY( in_dtype, out_dtype ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[RESIZE_1D_BILINEAR_QUANT_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); + int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); + int32_t in_width = inputs[0]->attr.size[0]; + int32_t out_width = outputs[0]->attr.size[0]; + float input_zp = (float)inputs[0]->attr.dtype.zero_point; + float input_scale = inputs[0]->attr.dtype.scale; + float input_tail = -(input_zp * input_scale); + float output_zp = (float)outputs[0]->attr.dtype.zero_point; + float output_scale = (0 == outputs[0]->attr.dtype.scale) ? 
1.0f : 1.0f / outputs[0]->attr.dtype.scale; + float half_pixel_value = 0.0f; + float scale_factor_x = 0.0f; + vsi_bool is_use_u8_kernel = FALSE; + + if (align_corners && out_width > 1) + { + scale_factor_x = ((vx_float32)(in_width - 1) * 1.0f) / (vx_float32)(out_width - 1); + } + else + { + scale_factor_x = ((vx_float32)in_width * 1.0f) / (vx_float32)out_width; + } + + if (half_pixel_centers) + { + half_pixel_value = 0.5f; + } + else + { + half_pixel_value = 0.0f; + } + + status = _query_kernel( kernel, inputs, outputs, &is_use_u8_kernel ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + size_t node_params_num = RESIZE_1D_BILINEAR_NUM; + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, RESIZE_1D_BILINEAR_QUANT_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_SCALE_X] = vsi_nn_kernel_scalar_create( graph, F32, &scale_factor_x ); + node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, F32, &half_pixel_value ); + if (is_use_u8_kernel) + { + node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale ); + node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input_tail ); + node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale ); + node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &output_zp ); + node_params_num = RESIZE_1D_BILINEAR_QUANT_NUM; + } + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_X] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] ); + if (is_use_u8_kernel) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_TAIL] ); + } + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( resize_1d_bilinear, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/resize_1d_nearest_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_1d_nearest_cl.c new file mode 100644 index 0000000..5b0f9a4 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/resize_1d_nearest_cl.c @@ -0,0 +1,312 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +typedef enum +{ + INTERNAL_KERNEL_RESIZE_1D_NEAREST, +} _internal_kernel_e; + +#define _RESIZE_1D_NEAREST_KERNEL_SOURCE "resize_1d_nearest" + +#define STR(a) #a +// Add kernel hashtable here +#define RESIZE_1D_NEAREST_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ + (( IN_DTYPE << 8 ) | ( OUT_DTYPE )) + +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_1D_NEAREST_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \ + CVIVANTE_NAMESPACE("cl.resize_1d_nearest_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + _RESIZE_1D_NEAREST_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _resize_1d_nearest_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( F32, F32), + PACK_KERNEL_MAP( U8, U8), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _resize_1d_nearest_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _RESIZE_1D_NEAREST_PARAM_NUM 5 +#define _RESIZE_1D_NEAREST_QUANT_NUM _cnt_of_array( _resize_1d_nearest_kernel_param_def ) + +#define SCALAR_SCALE_X (2) +#define SCALAR_HALF_PIXEL (3) +#define SCALAR_ROUND_VALUE (4) +#define SCALAR_SCALE_VALUE (5) +#define SCALAR_TAIL_VALUE (6) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_resize_1d_nearest_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_int_array_t * out_shape = NULL; + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + out_shape = output_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.dim = (out_shape->size < 3 || 1 == out_shape->data[2]) ? 2 : 3; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ?
out_shape->data[2] : 1; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(output_attr); + return status; +} /* _resize_1d_nearest_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool *is_use_u8_kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype = F16; + vsi_nn_kernel_dtype_e out_dtype = F16; + const _kernel_map_type * kernel_map = _resize_1d_nearest_kernel_map; + size_t kernel_map_size = _cnt_of_array( _resize_1d_nearest_kernel_map ); + vx_param_description_t * param_def = _resize_1d_nearest_kernel_param_def; + size_t param_def_size = _cnt_of_array( _resize_1d_nearest_kernel_param_def ); + vx_kernel_initialize_f initializer = _resize_1d_nearest_initializer; + + uint32_t key = 0; + uint32_t i = 0; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == in_dtype) + { + in_dtype = F32; + } + if (F16 == out_dtype) + { + out_dtype = F32; + } + + if ((U8 == in_dtype) || (U8 == out_dtype)) + { + param_def_size = _RESIZE_1D_NEAREST_QUANT_NUM; + *is_use_u8_kernel = TRUE; + } + else + { + param_def_size = _RESIZE_1D_NEAREST_PARAM_NUM; + *is_use_u8_kernel = FALSE; + } + + key = RESIZE_1D_NEAREST_HASH_KEY( in_dtype, out_dtype ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_RESIZE_1D_NEAREST_QUANT_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); + int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); + int32_t in_width = inputs[0]->attr.size[0]; + int32_t out_width = outputs[0]->attr.size[0]; + float input_zp = (float)inputs[0]->attr.dtype.zero_point; + float input_scale = inputs[0]->attr.dtype.scale; + float output_scale = (0 == outputs[0]->attr.dtype.scale) ? 
\ + input_scale : input_scale / outputs[0]->attr.dtype.scale; + float output_tail = (float)outputs[0]->attr.dtype.zero_point - input_zp * output_scale; + float half_pixel_value = 0.0f; + float round_value = 0.0f; + float scale_factor_x = 0.0f; + vsi_bool is_use_u8_kernel = FALSE; + + if (align_corners && out_width > 1) + { + scale_factor_x = ((vx_float32)(in_width - 1) * 1.0f) / (vx_float32)(out_width - 1); + } + else + { + scale_factor_x = ((vx_float32)in_width * 1.0f) / (vx_float32)out_width; + } + + if (align_corners) + { + round_value = 0.5f; + } + else + { + round_value = 0.0f; + } + + if (half_pixel_centers) + { + half_pixel_value = 0.5f; + } + else + { + half_pixel_value = 0.0f; + } + + status = _query_kernel( kernel, inputs, outputs, &is_use_u8_kernel ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + size_t node_params_num = _RESIZE_1D_NEAREST_PARAM_NUM; + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _RESIZE_1D_NEAREST_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_SCALE_X] = vsi_nn_kernel_scalar_create( graph, F32, &scale_factor_x ); + node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, F32, &half_pixel_value ); + node_params[SCALAR_ROUND_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &round_value ); + if (is_use_u8_kernel) + { + node_params[SCALAR_SCALE_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale ); + node_params[SCALAR_TAIL_VALUE] = vsi_nn_kernel_scalar_create(graph, F32, &output_tail ); + node_params_num = _RESIZE_1D_NEAREST_QUANT_NUM; + } + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_X] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_ROUND_VALUE] ); + if (is_use_u8_kernel) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_VALUE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_TAIL_VALUE] ); + } + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( resize_1d_nearest, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c b/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c index f52f367..03c1711 100644 --- a/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c @@ -49,7 +49,7 @@ typedef enum } unary_type_e; -#define _CPU_ARG_NUM (1) +#define _CPU_ARG_NUM (2) #define _CPU_INPUT_NUM (1) #define _CPU_OUTPUT_NUM (1) #define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) @@ -71,9 +71,9 @@ static float log_eval(float data) return logf(data); } -static float elu_eval(float data) +static float elu_eval(float data, float alpha) { - return data >=0 ? data : expf(data) - 1; + return data >=0 ? 
data : expf(data) * alpha - alpha; } static float neg_eval(float data) @@ -114,6 +114,7 @@ DEF_KERNEL_EXECUTOR(_eltwise_unary_exec) size_t out_elements = 0; vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; int32_t i; + float alpha = 0; int32_t unary_type = 0; tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; @@ -126,6 +127,8 @@ DEF_KERNEL_EXECUTOR(_eltwise_unary_exec) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &unary_type); CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[3], &alpha); + CHECK_STATUS_FAIL_GOTO(status, final ); buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); @@ -151,7 +154,7 @@ DEF_KERNEL_EXECUTOR(_eltwise_unary_exec) data = log_eval(data); break; case UNARY_ELU: - data = elu_eval(data); + data = elu_eval(data, alpha); break; case UNARY_NEG: data = neg_eval(data); @@ -193,9 +196,11 @@ static vx_param_description_t kernel_param_def[] = {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define INPUT_FUNC_TYPE (2) +#define INPUT_SCALAR_ALPHA (3) static const vx_kernel_description_t _kernel_info = { @@ -237,6 +242,7 @@ static vsi_nn_kernel_node_t _setup vsi_status status = VSI_SUCCESS; vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; + float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" ); status = _query_kernel( inputs, outputs, kernel ); if( VSI_SUCCESS == status) @@ -249,10 +255,13 @@ static vsi_nn_kernel_node_t _setup inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); backend_params[INPUT_FUNC_TYPE] = vsi_nn_kernel_scalar_create( graph, I32, &unary_type ); + backend_params[INPUT_SCALAR_ALPHA] = vsi_nn_kernel_scalar_create( + graph, F32, &alpha ); /* Pass parameters to node. 
*/ status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); vsi_nn_kernel_scalar_release( &backend_params[INPUT_FUNC_TYPE] ); + vsi_nn_kernel_scalar_release( &backend_params[INPUT_SCALAR_ALPHA] ); } else { diff --git a/src/tim/vx/internal/src/kernel/cpu/moments_cpu.c b/src/tim/vx/internal/src/kernel/cpu/moments_cpu.c index b9450c9..f1124bf 100644 --- a/src/tim/vx/internal/src/kernel/cpu/moments_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/moments_cpu.c @@ -258,11 +258,14 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; int32_t axis_num = 0; - int32_t* axis = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "axis", (size_t*)&axis_num); + size_t axis_num_temp = 0; + int32_t* axis = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "axis", &axis_num_temp); vsi_bool is_continue_axis = TRUE; uint32_t mask = 0; int32_t i = 0; + + axis_num = (int32_t)axis_num_temp; + for ( i = 1; i < axis_num; i++) { if ( axis[i] != (axis[i - 1] + 1) && axis[0] == 0) diff --git a/src/tim/vx/internal/src/kernel/cpu/resize_1d_bilinear_cpu.c b/src/tim/vx/internal/src/kernel/cpu/resize_1d_bilinear_cpu.c new file mode 100644 index 0000000..df91d90 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/resize_1d_bilinear_cpu.c @@ -0,0 +1,271 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta.
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.resize_1d_bilinear") + + +/* + * Kernel params + */ +static vx_param_description_t _resize_1d_bilinear_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _RESIZE_1D_BILINEAR_PARAM_NUM _cnt_of_array( _resize_1d_bilinear_kernel_param_def ) + +#define SCALAR_ALIGN_CORNERS (2) +#define SCALAR_HALF_PIXEL (3) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i = 0; + int32_t align_corners = 0; + int32_t half_pixel_centers = 0; + float width_scale = 1.0f; + uint32_t input_width = 0, output_width = 0; + uint32_t w = 0, out = 0; + uint32_t output_dims = 0; + float data00 = .0f, data01 = .0f, interpolation = .0f; + uint32_t index = 0; + uint32_t outer = 0; + + /* prepare data */ + for (i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + + } + for (i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_ALIGN_CORNERS], &(align_corners)); + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_HALF_PIXEL], &(half_pixel_centers)); + input_width = in_attr[0]->shape->data[0]; + output_width = out_attr[0]->shape->data[0]; + output_dims = (uint32_t)out_attr[0]->shape->size; + + if (align_corners && output_width > 1) + { + width_scale = ((vx_float32)(input_width - 1) * 1.0f) / (vx_float32)(output_width - 1); + } + else + { + width_scale = ((vx_float32)input_width * 1.0f) / (vx_float32)output_width; + } + + outer = 1; + + for (i = 1; i < output_dims; i++) + { + outer = outer * out_attr[0]->shape->data[i]; + } + + for (out = 0; out < outer; out++) + { + vx_int32 input_base = out * input_width; + vx_int32 output_base = out * output_width; + for (w = 0; w < output_width; w ++) + { + vx_float32 input_w; + vx_int32 w0; + vx_int32 w1; + if (half_pixel_centers) + { + input_w = ((vx_float32)w + 0.5f) * width_scale - 0.5f; + } + else + { + input_w = w * width_scale; + } + 
w0 = (vx_int32)input_w; + w1 = input_w < 0 ? 0 : vsi_nn_min(w0 + 1, (vx_int32)(input_width - 1)); + index = input_base + w0; + data00 = f32_in_buffer[0][index]; + index = input_base + w1; + data01 = f32_in_buffer[0][index]; + + interpolation = data00 * (1 - (input_w - w0)) + + data01 * (input_w - w0); + index = output_base + w; + f32_out_buffer[0][index] = interpolation; + } + } + + + /* save data */ + for (i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for (i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _resize_1d_bilinear_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _resize_1d_bilinear_kernel_param_def ); + status = VSI_SUCCESS; + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_RESIZE_1D_BILINEAR_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); + int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _RESIZE_1D_BILINEAR_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners ); + node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, I32, &half_pixel_centers ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _RESIZE_1D_BILINEAR_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALIGN_CORNERS] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( resize_1d_bilinear, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/resize_1d_nearest_cpu.c b/src/tim/vx/internal/src/kernel/cpu/resize_1d_nearest_cpu.c new file mode 100644 index 0000000..44e45a7 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/resize_1d_nearest_cpu.c @@ -0,0 +1,271 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta.
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.resize_1d_nearest") + + +/* + * Kernel params + */ +static vx_param_description_t _resize_1d_nearest_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _RESIZE_1D_NEAREST_PARAM_NUM _cnt_of_array( _resize_1d_nearest_kernel_param_def ) + +#define SCALAR_ALIGN_CORNERS (2) +#define SCALAR_HALF_PIXEL (3) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i = 0; + int32_t align_corners = 0; + int32_t half_pixel_centers = 0; + float width_scale = 1.0f; + uint32_t input_width = 0, output_width = 0; + uint32_t w = 0, out = 0; + uint32_t output_dims = 0; + uint32_t outer = 0; + /* prepare data */ + for (i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + + } + for (i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_ALIGN_CORNERS], &(align_corners)); + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_HALF_PIXEL], &(half_pixel_centers)); + input_width = in_attr[0]->shape->data[0]; + output_width = out_attr[0]->shape->data[0]; + output_dims = (uint32_t)out_attr[0]->shape->size; + + if (align_corners && output_width > 1) + { + width_scale = ((vx_float32)(input_width - 1) * 1.0f) / (vx_float32)(output_width - 1); + } + else + { + width_scale = ((vx_float32)input_width * 1.0f) / (vx_float32)output_width; + } + + outer = 1; + + for (i = 1; i < output_dims; i++) + { + outer = outer * out_attr[0]->shape->data[i]; + } + + for (out = 0; out < outer; out++) + { + vx_int32 input_base = out * input_width; + vx_int32 output_base = out * output_width; + + for (w = 0; w < output_width; w ++) + { + float input_w; + uint32_t in_x; + int32_t in_index; + int32_t out_index; + + if (half_pixel_centers) + { + input_w = ((float)w + 0.5f) * width_scale; + } + else + { + input_w = w * width_scale; + } + if (align_corners) + { + in_x = 
vsi_nn_min((uint32_t)simple_round(input_w), input_width - 1); + } + else + { + in_x = vsi_nn_min((uint32_t)floorf(input_w), input_width - 1); + } + in_index = in_x + input_base; + out_index = w + output_base; + f32_out_buffer[0][out_index] = f32_in_buffer[0][in_index]; + } + } + + /* save data */ + for (i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + + for (i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _resize_1d_nearest_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _resize_1d_nearest_kernel_param_def ); + status = VSI_SUCCESS; + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_RESIZE_1D_NEAREST_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); + int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _RESIZE_1D_NEAREST_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners ); + node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, I32, &half_pixel_centers ); + /* Pass parameters to node. 
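+ * The "align_corners" and "half_pixel_centers" values read in _setup() are
+ * assumed to be supplied by the calling operation through the
+ * vsi_nn_kernel_param dictionary (e.g. via vsi_nn_kernel_param_add_int32(),
+ * the counterpart of the vsi_nn_kernel_param_get_int32() calls above);
+ * that call site is outside this file.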
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _RESIZE_1D_NEAREST_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALIGN_CORNERS] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( resize_1d_nearest, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c b/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c index 501e860..97aa183 100644 --- a/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c @@ -279,10 +279,10 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer) if( U8 == input_dtype && F16 == output_dtype ) { - vx_uint16 M0 = 0; - vx_int8 postShift = 0; - vx_uint32 multAndoutZP0[2] = {0}; - vx_uint32 multAndoutZP1[2] = {0}; + uint16_t M0 = 0; + int32_t postShift = 0; + uint32_t multAndoutZP0[2] = {0}; + uint32_t multAndoutZP1[2] = {0}; gpu_dp_inst_t uniU8MulAndPostShift_0_Lo_2x8 = {{ 0xdddddddd, // TCfg @@ -305,12 +305,12 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer) 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16}; - vsi_nn_GetFP32MultiAndPostShift(scaleIn / scaleOut, &M0, &postShift); + gpu_quantize_multiplier_16bit(scaleIn / scaleOut, &M0, &postShift); multAndoutZP0[0] = (vx_uint32)(M0); multAndoutZP0[1] = (vx_uint32)((output_ZP << postShift) - input_ZP * M0); uniU8MulAndPostShift_0_Lo_2x8.data[7] |= (postShift & 0x1F); - vsi_nn_GetFP32MultiAndPostShift(scaleIn1 / scaleOut, &M0, &postShift); + gpu_quantize_multiplier_16bit(scaleIn1 / scaleOut, &M0, &postShift); multAndoutZP1[0] = (vx_uint32)(M0); multAndoutZP1[1] = (vx_uint32)((output_ZP << postShift) - input_ZP1 * M0); uniU8MulAndPostShift_1_Lo_2x8.data[7] |= (postShift & 0x1F); diff --git a/src/tim/vx/internal/src/kernel/evis/clip_evis.c b/src/tim/vx/internal/src/kernel/evis/clip_evis.c index 78d52fc..f0c673a 100644 --- a/src/tim/vx/internal/src/kernel/evis/clip_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/clip_evis.c @@ -268,7 +268,7 @@ DEF_KERNEL_INITIALIZER(_clip_initializer) { uint32_t multAndoutZP[2] = {0}; uint16_t M0 = 0; - int8_t postShift = 0; + int32_t postShift = 0; gpu_dp_inst_t uniDataMulAndPostShift_2x8 = {{ 0xdddddddd, // TCfg 0x44444444, // ASelt @@ -279,7 +279,7 @@ DEF_KERNEL_INITIALIZER(_clip_initializer) 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16}; - vsi_nn_GetFP32MultiAndPostShift(scaleIn / scaleOut, &M0, &postShift); + gpu_quantize_multiplier_16bit(scaleIn / scaleOut, &M0, &postShift); multAndoutZP[0] = (uint32_t)(M0); multAndoutZP[1] = (uint32_t)(output_ZP << postShift ); @@ -434,8 +434,8 @@ DEF_KERNEL_INITIALIZER(_clip_initializer) int32_t packedMaxData[4]; float uint8Scale = scaleIn / scaleOut; uint16_t M0 = 0; - int8_t postShift = 0; - uint32_t multAndoutZP[2] = {0}; + int32_t postShift = 0; + uint32_t multAndoutZP[2] = {0}; gpu_dp_inst_t uniU8MulAndPostShift_Lo_2x8 = {{ 0xdddddddd, // TCfg 0x44444444, // ASelt @@ -457,7 +457,7 @@ DEF_KERNEL_INITIALIZER(_clip_initializer) 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16}; - vsi_nn_GetFP32MultiAndPostShift(uint8Scale, &M0, &postShift); + gpu_quantize_multiplier_16bit(uint8Scale, &M0, &postShift); multAndoutZP[0] = (uint32_t)(M0); multAndoutZP[1] = (uint32_t)((output_ZP << postShift) - input_ZP * M0); diff --git 
a/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c b/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c index 9c8cbab..2c7c4f6 100644 --- a/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c @@ -702,27 +702,66 @@ static vsi_nn_kernel_node_t _setup ) { vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_DEPTHWISE_CONV1D_PARAM_NUM]; + vsi_nn_kernel_node_param_t node_params[_DEPTHWISE_CONV1D_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; int32_t weight_pad_front[VSI_NN_MAX_DIM_NUM] = {0}; int32_t weight_pad_end[VSI_NN_MAX_DIM_NUM] = {0}; vsi_nn_tensor_t * weights = NULL; vsi_nn_tensor_t * biases = NULL; vsi_nn_tensor_t *temp_tensor[3] = {NULL}; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + int32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + int32_t new_rank = 2; + uint32_t i = 0; int32_t stride = vsi_nn_kernel_param_get_int32( params, "stride" ); int32_t pad_front = vsi_nn_kernel_param_get_int32( params, "pad_front" ); int32_t pad_end = vsi_nn_kernel_param_get_int32( params, "pad_end" ); int32_t dilation = vsi_nn_kernel_param_get_int32( params, "dilation" ); _internal_kernel_size_e ks = KN; - weight_pad_end[0] = gpu_align_np2_safe(inputs[1]->attr.size[0], 8) - inputs[1]->attr.size[0]; + if (!((VSI_NN_TYPE_UINT8 == inputs[0]->attr.dtype.vx_type) + && (VSI_NN_TYPE_UINT8 == inputs[1]->attr.dtype.vx_type) + && (NULL == inputs[2] || VSI_NN_TYPE_INT32 == inputs[2]->attr.dtype.vx_type) + && (VSI_NN_TYPE_UINT8 == outputs[0]->attr.dtype.vx_type))) + { + return NULL; + } - weights = vsi_nn_pad_tensor(graph, inputs[1], weight_pad_front, weight_pad_end, inputs[1]->attr.dim_num, - VSI_NN_PAD_MODE_CONSTANT, 0); + reshape_tensors[0] = inputs[0]; - biases = vsi_nn_merge_input_zeropoint_to_bias(graph, inputs[0], inputs[1], inputs[2]); + if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) + { + shape[0] = inputs[1]->attr.size[0]; + shape[1] = 1; + for (i = 1; i < inputs[1]->attr.dim_num; i++) + { + shape[1] *= inputs[1]->attr.size[i]; + } + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + inputs[1], (uint32_t*)shape, new_rank ); + } + else + { + reshape_tensors[1] = inputs[1]; + } - temp_tensor[0] = inputs[0]; + if (inputs[2] && inputs[2]->attr.dim_num == 1) + { + shape[0] = inputs[2]->attr.size[0]; + shape[1] = 1; + new_rank = 2; + reshape_tensors[2] = vsi_nn_reshape_tensor( graph, + inputs[2], (uint32_t*)shape, new_rank ); + } + + weight_pad_end[0] = gpu_align_np2_safe(reshape_tensors[1]->attr.size[0], 8) - reshape_tensors[1]->attr.size[0]; + + weights = vsi_nn_pad_tensor(graph, reshape_tensors[1], weight_pad_front, weight_pad_end, + reshape_tensors[1]->attr.dim_num, VSI_NN_PAD_MODE_CONSTANT, 0); + + biases = vsi_nn_merge_input_zeropoint_to_bias(graph, reshape_tensors[0], reshape_tensors[1], reshape_tensors[2]); + + temp_tensor[0] = reshape_tensors[0]; temp_tensor[1] = weights; temp_tensor[2] = biases; @@ -760,6 +799,16 @@ static vsi_nn_kernel_node_t _setup } } + if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) + { + vsi_nn_ReleaseTensor( &reshape_tensors[1] ); + } + + if (inputs[2] && inputs[2]->attr.dim_num == 1) + { + vsi_nn_ReleaseTensor( &reshape_tensors[2] ); + } + if (weights) { vsi_nn_ReleaseTensor(&weights); diff --git a/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c b/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c index 4d67bf3..6cba0a0 100644 --- 
a/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c @@ -164,9 +164,9 @@ DEF_KERNEL_INITIALIZER(_detect_post_box_initializer) else if ((U8 == input_attr->dtype) || (U8 == input1_attr->dtype)) { uint16_t M0 = 0; - int8_t postShift0 = 0; + int32_t postShift0 = 0; uint16_t M1 = 0; - int8_t postShift1 = 0; + int32_t postShift1 = 0; uint32_t i = 0; gpu_dp_inst_t uniU8SubZptoF32Conv0_4x4 = {{ 0x09090909, // TCfg @@ -188,8 +188,8 @@ DEF_KERNEL_INITIALIZER(_detect_post_box_initializer) 0x00010001, 0x00000000, 0x00010001, 0x00000000, 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - vsi_nn_GetFP32MultiAndPostShift(scaleIn0, &M0, &postShift0); - vsi_nn_GetFP32MultiAndPostShift(scaleIn1, &M1, &postShift1); + gpu_quantize_multiplier_16bit(scaleIn0, &M0, &postShift0); + gpu_quantize_multiplier_16bit(scaleIn1, &M1, &postShift1); uniU8SubZptoF32Conv0_4x4.data[7] |= (postShift0 & 0x1F); uniU8SubZptoF32Conv1_4x4.data[7] |= (postShift1 & 0x1F); for ( i = 0; i < 8; i++ ) diff --git a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c index 995455c..e6831f7 100644 --- a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c @@ -266,9 +266,11 @@ static vx_param_description_t kernel_param_def[] = {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define INPUT_FUNC_TYPE (2) +#define INPUT_SCALAR_ALPHA (3) #define _CL_PARAM_NUM _cnt_of_array(kernel_param_def) /* @@ -296,6 +298,7 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) float inputTail = 0; float outputScale = 1.0f; float outputZP = 0; + float alpha = 0; uint32_t pack_key; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); @@ -303,7 +306,9 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &type); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[INPUT_FUNC_TYPE], &type); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[INPUT_SCALAR_ALPHA], &alpha); CHECK_STATUS_FAIL_GOTO(status, final ); out_shape = attr[1]->shape; @@ -408,6 +413,8 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 ); status |= vsi_nn_kernel_gpu_add_param( node, "uniExtractOddData_2x8", &uniExtractOddData_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "alpha", &alpha ); CHECK_STATUS_FAIL_GOTO(status, final ); } break; @@ -466,6 +473,8 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) "outputScale", &outputScale ); status |= vsi_nn_kernel_gpu_add_param( node, "outputZP", &outputZP ); + status |= vsi_nn_kernel_gpu_add_param( node, + "alpha", &alpha ); if (attr[1]->dtype == F16) { @@ -555,7 +564,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_tensor_t* rs_tensors[2] = { NULL }; int32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; int32_t new_rank = 0; - vsi_bool ret; + vsi_bool ret = FALSE; + float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" ); ret = 
vsi_nn_kernel_optimize_element_shape( (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, @@ -586,6 +596,8 @@ static vsi_nn_kernel_node_t _setup rs_tensors, 1, &rs_tensors[1], 1 ); node_params[INPUT_FUNC_TYPE] = vsi_nn_kernel_scalar_create( graph, I32, &unary_type ); + node_params[INPUT_SCALAR_ALPHA] = vsi_nn_kernel_scalar_create( + graph, F32, &alpha ); /* Pass parameters to node. */ status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM ); @@ -609,6 +621,11 @@ OnError: vsi_nn_kernel_scalar_release( &node_params[INPUT_FUNC_TYPE] ); } + if (node_params[INPUT_SCALAR_ALPHA]) + { + vsi_nn_kernel_scalar_release( &node_params[INPUT_SCALAR_ALPHA] ); + } + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c index 12bac55..53fce38 100644 --- a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c @@ -181,8 +181,8 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) float dstScale = 0; uint16_t M0 = 0; uint16_t M1 = 0; - int8_t postShift0 = 0; - int8_t postShift1 = 0; + int32_t postShift0 = 0; + int32_t postShift1 = 0; uint32_t pack_key = 0; int32_t ac2zero = 0; @@ -279,8 +279,8 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) dstScale = 1; dstZP = 0.0f; } - vsi_nn_GetFP32MultiAndPostShift(src0Scale / 1.0f, &M0, &postShift0); - vsi_nn_GetFP32MultiAndPostShift(src1Scale / 1.0f, &M1, &postShift1); + gpu_quantize_multiplier_16bit(src0Scale / 1.0f, &M0, &postShift0); + gpu_quantize_multiplier_16bit(src1Scale / 1.0f, &M1, &postShift1); mulKIn0In1Zp = (float)((int)(K + 3) / 4 * 4 * src1ZP * src0ZP); inOutScale = src0Scale * src1Scale / dstScale; diff --git a/src/tim/vx/internal/src/kernel/evis/moments_evis.c b/src/tim/vx/internal/src/kernel/evis/moments_evis.c index 032d473..4416328 100644 --- a/src/tim/vx/internal/src/kernel/evis/moments_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/moments_evis.c @@ -588,7 +588,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_MOMENTS_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; int32_t axis_num = 0; - int32_t* axis = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "axis", (size_t*)&axis_num); + size_t axis_num_temp = 0; + int32_t* axis = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "axis", &axis_num_temp); int32_t axis_first = axis[0]; int32_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 1, 1, 1, 1 } }; vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; @@ -602,6 +603,8 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; vsi_bool is_continue_axis = TRUE; + axis_num = (int32_t)axis_num_temp; + for ( i = 1; i < axis_num; i++) { if ( axis[i] != (axis[i - 1] + 1) && axis[0] == 0) diff --git a/src/tim/vx/internal/src/kernel/evis/poolwithargmax_evis.c b/src/tim/vx/internal/src/kernel/evis/poolwithargmax_evis.c index db0aea8..603be47 100644 --- a/src/tim/vx/internal/src/kernel/evis/poolwithargmax_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/poolwithargmax_evis.c @@ -139,7 +139,7 @@ DEF_KERNEL_INITIALIZER(_poolwithargmax_initializer) int32_t input_fl = 0; int32_t output_fl = 0; uint16_t M0 = 0; - int8_t postShift = 0; + int32_t postShift = 0; float inputScale = 1.0f; int32_t input_ZP = 0; float outputScale = 1.0f; @@ -193,7 +193,7 @@ DEF_KERNEL_INITIALIZER(_poolwithargmax_initializer) if ( ( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) && ( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) ) { - 
vsi_nn_GetFP32MultiAndPostShift(inputScale / outputScale, &M0, &postShift); + gpu_quantize_multiplier_16bit(inputScale / outputScale, &M0, &postShift); } image_2d = (vsi_bool)(input_shape->size < 3 || 1 == input_shape->data[2]); diff --git a/src/tim/vx/internal/src/kernel/evis/pow_evis.c b/src/tim/vx/internal/src/kernel/evis/pow_evis.c index 701d23e..35b25a0 100644 --- a/src/tim/vx/internal/src/kernel/evis/pow_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pow_evis.c @@ -196,8 +196,8 @@ DEF_KERNEL_INITIALIZER(_pow_initializer) float dstZP = 0; float dstScale = 1.0f; - int8_t postshift0 = 0; - int8_t postshift1 = 0; + int32_t postshift0 = 0; + int32_t postshift1 = 0; float outScale_fl = 1; uint16_t M0 = 0; @@ -229,7 +229,7 @@ DEF_KERNEL_INITIALIZER(_pow_initializer) src0ZP = attr[0]->asymm.zero_point; src0Scale = attr[0]->asymm.scale; - vsi_nn_GetFP32MultiAndPostShift(src0Scale / 1.0f, &M0, &postshift0); + gpu_quantize_multiplier_16bit(src0Scale / 1.0f, &M0, &postshift0); } if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) @@ -243,7 +243,7 @@ DEF_KERNEL_INITIALIZER(_pow_initializer) src1ZP = attr[1]->asymm.zero_point; src1Scale = attr[1]->asymm.scale; - vsi_nn_GetFP32MultiAndPostShift(src1Scale / 1.0f, &M1, &postshift1); + gpu_quantize_multiplier_16bit(src1Scale / 1.0f, &M1, &postshift1); } if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) diff --git a/src/tim/vx/internal/src/kernel/evis/resize_1d_bilinear_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_1d_bilinear_evis.c new file mode 100644 index 0000000..f1166dd --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/resize_1d_bilinear_evis.c @@ -0,0 +1,1344 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" +#include "utils/vsi_nn_dtype_util_prv.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
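+ *
+ * A minimal sketch of the 1D bilinear weight computation that the kernels in
+ * this file and the scale tensor built by _create_scale_tensor() below rely
+ * on; bilinear_1d_weights is a hypothetical helper used only for illustration.
+ */
+#if 0
+static void bilinear_1d_weights(uint32_t x, float scale_x, int32_t half_pixel_centers,
+                                int32_t * w0, float * left, float * right)
+{
+    /* Fractional source column for output column x. */
+    float src = half_pixel_centers ? ((float)x + 0.5f) * scale_x - 0.5f
+                                   : (float)x * scale_x;
+    *w0    = (int32_t)src;         /* left source column (truncated toward zero) */
+    *right = src - (float)(*w0);   /* weight applied to column w0 + 1 */
+    *left  = 1.0f - *right;        /* weight applied to column w0 */
+}
+/* Example: a 2x upscale with half_pixel_centers (scale_x = 0.5) yields alternating
+ * (left, right) weights of (0.75, 0.25) and (0.25, 0.75), which the half-float
+ * constants 0x3a00 / 0x3400 in uniResize2xUp_half_2x8 below appear to encode. */
+#endif
+/*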
+ */ +typedef enum +{ + DOWN = 0, + DOWN_2X_SAME, + DOWN_2X_HALF_SAME, + UP, + UP_OPT, + UP_2X_SAME, + UP_2X_HALF_SAME, + UP_4X_SAME, + UP_4X_HALF_SAME, + UP_8X_SAME, + UP_8X_HALF_SAME, +} _internal_scale_e; + +#define _RESIZE_BILINEAR_KERNEL_SOURCE(_input_type) "resize_1d_bilinear_"#_input_type +#define _RESIZE_BILINEAR_KERNEL_SOURCE_OPT(_input_type) "resize_1d_bilinear_"#_input_type"_opt" +#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_NX() "resize_1d_bilinear_UP_NX" +#define _RESIZE_BILINEAR_KERNEL_SOURCE_DOWN_NX() "resize_1d_bilinear_DOWN_NX" + +#define STR(a) #a +// Add kernel hashtable here +#define RESIZE_1D_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, scale_flag ) \ + (( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) | (scale_flag)) + +#define PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_1D_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, DOWN ), \ + CVIVANTE_NAMESPACE("evis.resize_1d_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_DOWN"), \ + _RESIZE_BILINEAR_KERNEL_SOURCE(IN_DTYPE) } + +#define PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_1D_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP ), \ + CVIVANTE_NAMESPACE("evis.resize_1d_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_UP"), \ + _RESIZE_BILINEAR_KERNEL_SOURCE(IN_DTYPE) } + +#define PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_1D_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_OPT ), \ + CVIVANTE_NAMESPACE("evis.resize_1d_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_UP_opt"), \ + _RESIZE_BILINEAR_KERNEL_SOURCE_OPT(IN_DTYPE) } + +#define PACK_KERNEL_MAP_UP_NX( IN_DTYPE, OUT_DTYPE, MODE_TYPE ) \ + { RESIZE_1D_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, MODE_TYPE ), \ + CVIVANTE_NAMESPACE("evis.resize_1d_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_"STR(MODE_TYPE)), \ + _RESIZE_BILINEAR_KERNEL_SOURCE_UP_NX() } + +#define PACK_KERNEL_MAP_DOWN_NX( IN_DTYPE, OUT_DTYPE, MODE_TYPE ) \ + { RESIZE_1D_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, MODE_TYPE ), \ + CVIVANTE_NAMESPACE("evis.resize_1d_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_"STR(MODE_TYPE)), \ + _RESIZE_BILINEAR_KERNEL_SOURCE_DOWN_NX() } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _resize_1d_bilinear_kernel_map[] = +{ + PACK_KERNEL_MAP_DOWN(I8, I8), + PACK_KERNEL_MAP_DOWN(I16, I16), + PACK_KERNEL_MAP_DOWN(U8, F16), + PACK_KERNEL_MAP_DOWN(U8, U8), + PACK_KERNEL_MAP_DOWN(F16, F16), + PACK_KERNEL_MAP_DOWN(F16, U8), + PACK_KERNEL_MAP_DOWN(BF16, BF16), + PACK_KERNEL_MAP_UP(I8, I8), + PACK_KERNEL_MAP_UP(I16, I16), + PACK_KERNEL_MAP_UP(U8, U8), + PACK_KERNEL_MAP_UP(F16, F16), + PACK_KERNEL_MAP_UP(BF16, BF16), + PACK_KERNEL_MAP_UP_OPT(U8, U8), + PACK_KERNEL_MAP_UP_NX(U8, U8, UP_2X_HALF_SAME), + PACK_KERNEL_MAP_UP_NX(U8, U8, UP_2X_SAME), + PACK_KERNEL_MAP_UP_NX(I8, I8, UP_2X_HALF_SAME), + PACK_KERNEL_MAP_UP_NX(I8, I8, UP_2X_SAME), + PACK_KERNEL_MAP_UP_NX(I16, I16, UP_2X_HALF_SAME), + PACK_KERNEL_MAP_UP_NX(I16, I16, UP_2X_SAME), + PACK_KERNEL_MAP_UP_NX(F16, F16, UP_2X_HALF_SAME), + PACK_KERNEL_MAP_UP_NX(F16, F16, UP_2X_SAME), + PACK_KERNEL_MAP_UP_NX(U8, U8, UP_4X_HALF_SAME), + PACK_KERNEL_MAP_UP_NX(U8, U8, UP_4X_SAME), + PACK_KERNEL_MAP_UP_NX(I8, I8, UP_4X_HALF_SAME), + PACK_KERNEL_MAP_UP_NX(I8, I8, UP_4X_SAME), + PACK_KERNEL_MAP_UP_NX(I16, I16, UP_4X_HALF_SAME), + PACK_KERNEL_MAP_UP_NX(I16, I16, UP_4X_SAME), + PACK_KERNEL_MAP_UP_NX(F16, F16, UP_4X_HALF_SAME), + PACK_KERNEL_MAP_UP_NX(F16, F16, UP_4X_SAME), + PACK_KERNEL_MAP_UP_NX(U8, U8, UP_8X_HALF_SAME), + PACK_KERNEL_MAP_UP_NX(U8, U8, UP_8X_SAME), + 
PACK_KERNEL_MAP_UP_NX(I8, I8, UP_8X_HALF_SAME), + PACK_KERNEL_MAP_UP_NX(I8, I8, UP_8X_SAME), + PACK_KERNEL_MAP_UP_NX(I16, I16, UP_8X_HALF_SAME), + PACK_KERNEL_MAP_UP_NX(I16, I16, UP_8X_SAME), + PACK_KERNEL_MAP_UP_NX(F16, F16, UP_8X_HALF_SAME), + PACK_KERNEL_MAP_UP_NX(F16, F16, UP_8X_SAME), + PACK_KERNEL_MAP_DOWN_NX(U8, U8, DOWN_2X_HALF_SAME), + PACK_KERNEL_MAP_DOWN_NX(U8, U8, DOWN_2X_SAME), + PACK_KERNEL_MAP_DOWN_NX(I8, I8, DOWN_2X_HALF_SAME), + PACK_KERNEL_MAP_DOWN_NX(I8, I8, DOWN_2X_SAME), + PACK_KERNEL_MAP_DOWN_NX(I16, I16, DOWN_2X_HALF_SAME), + PACK_KERNEL_MAP_DOWN_NX(I16, I16, DOWN_2X_SAME), + PACK_KERNEL_MAP_DOWN_NX(F16, F16, DOWN_2X_HALF_SAME), + PACK_KERNEL_MAP_DOWN_NX(F16, F16, DOWN_2X_SAME), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _resize_1d_bilinear_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _RESIZE_1D_BILINEAR_PARAM_NUM _cnt_of_array( _resize_1d_bilinear_kernel_param_def ) +#define _RESIZE_NO_SCALE_PARAM_NUM 4 +#define _RESIZE_1D_NX_KERENL_PARAM_NUM 3 + +#define SCALAR_ALIGN_CORNERS (2) +#define SCALAR_HALF_PIXEL (3) +#define SCALAR_TENSOR_SCALE (4) +#define SCALAR_SCALE_TYPE (2) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_resize_1d_bilinear_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_nn_kernel_tensor_attr_t * input_attr = NULL; + vsi_int_array_t * out_shape = NULL; + vsi_int_array_t * in_shape = NULL; + vsi_nn_kernel_dtype_e input_dtype = F16; + vsi_nn_kernel_dtype_e output_dtype = F16; + int32_t align_corners = 0; + int32_t half_pixel_centers = 0; + + uint32_t depth = 0; + int32_t srcFixPointPos = 0; + int32_t dstFixPointPos = 0; + float input_scale = 1.0; + int32_t inputZP = 0; + float output_scale = 1.0; + int32_t outputZP = 0; + float scale_factor = 1.0f; + uint32_t in_width = 1; + uint32_t out_width = 1; + uint32_t out_height = 1; + float half_pixel_value = 0.0f; + vsi_bool is_use_scale_kernel = (vsi_bool)(_RESIZE_1D_BILINEAR_PARAM_NUM == param_size); + _internal_scale_e scale_flag = DOWN; + vsi_bool is_run_nx_kernel = (vsi_bool)(_RESIZE_1D_NX_KERENL_PARAM_NUM == param_size); + int32_t scale_type_value = 0; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + if (is_run_nx_kernel) + { + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &scale_type_value); + CHECK_STATUS_FAIL_GOTO(status, final ); + scale_flag = (_internal_scale_e)scale_type_value; + } + else + { + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &align_corners); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &half_pixel_centers); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + out_shape = output_attr->shape; + in_shape = 
input_attr->shape; + input_dtype = input_attr->dtype; + output_dtype = output_attr->dtype; + + in_width = in_shape->data[0]; + depth = in_shape->data[2]; + out_width = out_shape->data[0]; + out_height = out_shape->data[1]; + + if (align_corners && out_width > 1) + { + scale_factor = ((vx_float32)(in_width - 1) * 1.0f) / (vx_float32)(out_width - 1); + } + else + { + scale_factor = ((vx_float32)in_width * 1.0f) / (vx_float32)out_width; + } + + if (half_pixel_centers) + { + half_pixel_value = 0.5f; + } + else + { + half_pixel_value = 0.0f; + } + + if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant ) + { + input_scale = input_attr->asymm.scale; + inputZP = input_attr->asymm.zero_point; + } + else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) + { + srcFixPointPos = input_attr->dfp.fl; + if (srcFixPointPos >= 0) + { + input_scale = 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos); + } + else if (srcFixPointPos < 0) + { + input_scale = (vx_float32)((int64_t)1 << -srcFixPointPos); + } + inputZP = 0; + } + else + { + input_scale = 1.0f; + inputZP = 0; + } + + if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant ) + { + output_scale = output_attr->asymm.scale; + outputZP = output_attr->asymm.zero_point; + } + else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) + { + dstFixPointPos = output_attr->dfp.fl; + if (dstFixPointPos >= 0) + { + output_scale = (vx_float32) ((int64_t)1 << dstFixPointPos); + } + else if (dstFixPointPos < 0) + { + output_scale = 1.0f / (vx_float32) ((int64_t)1 << -dstFixPointPos); + } + outputZP = 0; + } + else + { + output_scale = 1.0; + outputZP = 0; + } + + if (is_run_nx_kernel) + { + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + } + else + { + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + + if (is_run_nx_kernel) + { + gpu_dp_inst_t uniResize2xUp_half_2x8 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x32212110, 0x54434332, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3a003400, 0x34003a00, 0x3a003400, 0x34003a00, + 0x3a003400, 0x34003a00, 0x3a003400, 0x34003a00 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize2xUp_2x8 = {{ + 0x51515151, // TCfg + 0x00000000, // ASelt + 0x21011000, 0x43033202, // ABin + 0xa2a2a2a2, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x38003800, 0x00003c00, 0x38003800, + 0x00003c00, 0x38003800, 0x00003c00, 0x38003800 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize4xUp_half_2x8 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x21211010, 0x32322121, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x39003600, 0x3b003000, 0x30003b00, 0x36003900, + 0x39003600, 0x3b003000, 0x30003b00, 0x36003900 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize4xUp_2x8 = {{ + 0x55515551, // TCfg + 0x00000000, // ASelt + 0x10101000, 0x21212101, // ABin + 0xaaa2aaa2, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x34003a00, 0x38003800, 0x3a003400, + 0x00003c00, 0x34003a00, 0x38003800, 0x3a003400 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize8xUp_half_2x8 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x10101010, 0x21212121, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 
0x38803700, 0x39803500, 0x3a803200, 0x3b802c00, + 0x2c003b80, 0x32003a80, 0x35003980, 0x37003880 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize8xUp_2x8 = {{ + 0x55555551, // TCfg + 0x00000000, // ASelt + 0x10101000, 0x10101010, // ABin + 0xaaaaaaa2, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x30003b00, 0x34003a00, 0x36003900, + 0x38003800, 0x39003600, 0x3a003400, 0x3b003000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize2xDown_8bit_half_2x8 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0xfedcba98, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x38003800, 0x38003800, 0x38003800, 0x38003800, + 0x38003800, 0x38003800, 0x38003800, 0x38003800 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize2xDown_8bit_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x06040200, 0x0e0c0a08, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize2xDown_16bit_half_2x8 = {{ + 0x55555555, // TCfg + 0x55550000, // ASelt + 0x76543210, 0x76543210, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x38003800, 0x38003800, 0x38003800, 0x38003800, + 0x38003800, 0x38003800, 0x38003800, 0x38003800 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize2xDown_16bit_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); + if ( UP_2X_HALF_SAME == scale_flag ) + { + status |= vsi_nn_kernel_gpu_add_param( node, "uniResizeNxUp_2x8", &uniResize2xUp_half_2x8); + } + else if ( UP_2X_SAME == scale_flag ) + { + status |= vsi_nn_kernel_gpu_add_param( node, "uniResizeNxUp_2x8", &uniResize2xUp_2x8); + } + else if ( UP_4X_HALF_SAME == scale_flag ) + { + status |= vsi_nn_kernel_gpu_add_param( node, "uniResizeNxUp_2x8", &uniResize4xUp_half_2x8); + } + else if ( UP_4X_SAME == scale_flag ) + { + status |= vsi_nn_kernel_gpu_add_param( node, "uniResizeNxUp_2x8", &uniResize4xUp_2x8); + } + else if ( UP_8X_HALF_SAME == scale_flag ) + { + status |= vsi_nn_kernel_gpu_add_param( node, "uniResizeNxUp_2x8", &uniResize8xUp_half_2x8); + } + else if ( UP_8X_SAME == scale_flag ) + { + status |= vsi_nn_kernel_gpu_add_param( node, "uniResizeNxUp_2x8", &uniResize8xUp_2x8); + } + else if ( DOWN_2X_HALF_SAME == scale_flag ) + { + if (I8 == input_dtype || U8 == input_dtype) + { + status |= vsi_nn_kernel_gpu_add_param( node, "uniResizeNxDown_2x8", &uniResize2xDown_8bit_half_2x8); + } + else + { + status |= vsi_nn_kernel_gpu_add_param( node, "uniResizeNxDown_2x8", &uniResize2xDown_16bit_half_2x8); + } + } + else if ( DOWN_2X_SAME == scale_flag ) + { + if (I8 == input_dtype || U8 == input_dtype) + { + status |= vsi_nn_kernel_gpu_add_param( node, "uniResizeNxDown_2x8", &uniResize2xDown_8bit_2x8); + } + else + { + status |= vsi_nn_kernel_gpu_add_param( node, "uniResizeNxDown_2x8", &uniResize2xDown_16bit_2x8); + } + } + 
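+            /* A hedged reading of the two DOWN_2X tables above, assuming the 16-bit
+               constants are half-precision floats (0x3c00 = 1.0, 0x3800 = 0.5):
+               DOWN_2X_SAME maps output column x to source column 2x exactly, so each
+               lane applies a single 1.0 weight, while DOWN_2X_HALF_SAME maps x to
+               2x + 0.5 and averages columns 2x and 2x + 1 with 0.5/0.5 weights.
+               The UP_2X tables encode the matching upscale weights, e.g. (0.75, 0.25)
+               and (0.25, 0.75) for the half-pixel case. */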
CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) + { + float dfpScale = input_scale * output_scale; + gpu_dp_inst_t uniConvertDFP2FP32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtact8Bit_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + vx_uint32 uniConvertDFP2FP32_left_4x4[16] = { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00020000, 0x00060004, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }; + vx_uint32 uniConvertDFP2FP32_right_4x4[16] = { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00030001, 0x00070005, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }; + if (I8 == input_dtype && I8 == output_dtype && out_width > in_width) + { + gpu_dp_inst_t uniConvertI32toI16_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniGetMaskShift_2x8 = {{ + 0x99999999, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x55555555, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertDFP2FP32_part1_4x4 = {{ + 0x09090909, // TCfg + 0x00000000, // ASelt + 0x00150004, 0x00370026, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniConvertI32toI16_2x8", &uniConvertI32toI16_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniGetMaskShift_2x8", &uniGetMaskShift_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertDFP2FP32_4x4", &uniConvertDFP2FP32_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertDFP2FP32_part1_4x4", + &uniConvertDFP2FP32_part1_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (I16 == input_dtype && I16 == output_dtype && out_width > in_width) + { + gpu_dp_inst_t uniConvertI32toI16_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 
0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniGetMaskShift_2x8 = {{ + 0x99999999, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x55555555, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertDFP2FP32_part1_4x4 = {{ + 0x09090909, // TCfg + 0x00000000, // ASelt + 0x00150004, 0x00370026, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniConvertI32toI16_2x8", &uniConvertI32toI16_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniGetMaskShift_2x8", &uniGetMaskShift_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertDFP2FP32_4x4", &uniConvertDFP2FP32_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertDFP2FP32_part1_4x4", + &uniConvertDFP2FP32_part1_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else + { + status = vsi_nn_kernel_gpu_add_param( node, "uniConvertDFP2FP32_left_4x4", + &uniConvertDFP2FP32_left_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertDFP2FP32_right_4x4", + &uniConvertDFP2FP32_right_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + status = vsi_nn_kernel_gpu_add_param( node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "scale_x", &scale_factor); + status |= vsi_nn_kernel_gpu_add_param( node, "dfpScale", &dfpScale); + status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); + CHECK_STATUS_FAIL_GOTO(status, final ); + gpu_param.global_scale[1] = out_height; + } + else if (U8 == input_dtype && (U8 == output_dtype || F16 == output_dtype)) + { + float uint8Scale = input_scale / output_scale; + float uint8ZP_out = (float)outputZP; + gpu_dp_inst_t uniExtact8Bit_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8SubZPtoFp32_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + vx_uint32 uniU8SubZPtoFp32_left_4x4[16] = { + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00020000, 0x00060004, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }; + vx_uint32 uniU8SubZPtoFp32_right_4x4[16] = { + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00030001, 0x00070005, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }; + + if (F16 == output_dtype) + { + status = vsi_nn_kernel_gpu_add_param( node, "uint8Scale", &input_scale); + status |= vsi_nn_kernel_gpu_add_param( node, "uniU8SubZPtoFp32_left_4x4", &uniU8SubZPtoFp32_left_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniU8SubZPtoFp32_right_4x4", &uniU8SubZPtoFp32_right_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else + { + if (out_width > in_width) + { + gpu_dp_inst_t uniConvertI32toI16_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniGetMaskShift_2x8 = {{ + 0x99999999, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x55555555, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8SubZPtoFp32_part1_4x4 = {{ + 0x09090909, // TCfg + 0x00000000, // ASelt + 0x00150004, 0x00370026, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + vx_uint32 uniBilinear_4x4[16] = { + 0x05050505, // TCfg + 0x00000000, // ASelt + 0x00510040, 0x00730062, // ABin + 0x05050505, // BSelt + 0x00320010, 0x00760054, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }; + status = vsi_nn_kernel_gpu_add_param( node, "uniConvertI32toI16_2x8", &uniConvertI32toI16_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniGetMaskShift_2x8", &uniGetMaskShift_2x8); + if (is_use_scale_kernel) + { + status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_4x4", &uniBilinear_4x4); + } + else + { + status |= vsi_nn_kernel_gpu_add_param( node, "uniU8SubZPtoFp32_4x4", &uniU8SubZPtoFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8SubZPtoFp32_part1_4x4", &uniU8SubZPtoFp32_part1_4x4); + } + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else + { + status = vsi_nn_kernel_gpu_add_param( node, "uniU8SubZPtoFp32_left_4x4", + &uniU8SubZPtoFp32_left_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniU8SubZPtoFp32_right_4x4", + &uniU8SubZPtoFp32_right_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + if (!is_use_scale_kernel) + { + status = vsi_nn_kernel_gpu_add_param( node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uint8Scale", &uint8Scale); + status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &uint8ZP_out); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + } + status = vsi_nn_kernel_gpu_add_param( node, "scale_x", &scale_factor); + if (!is_use_scale_kernel) + { + status |= vsi_nn_kernel_gpu_add_param( node, "input_ZP", &inputZP); + } + status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); + CHECK_STATUS_FAIL_GOTO(status, final ); + gpu_param.global_scale[1] = out_height; + + } + else if (F16 == input_dtype && (U8 == output_dtype || F16 == output_dtype)) + { + float uint8Scale = 1.0f / 
output_scale; + float uint8ZP_out = (vx_float32)outputZP; + gpu_dp_inst_t uniExtact8Bit_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniFp16toFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniRightSubLeft_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtactHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16}; + vx_uint32 uniConvertFp2FP32_left_4x4[16] = { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00020000, 0x00060004, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }; + vx_uint32 uniConvertFp2FP32_right_4x4[16] = { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00030001, 0x00070005, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }; + if (F16 == input_dtype && F16 == output_dtype && out_width > in_width) + { + gpu_dp_inst_t uniConvertI32toI16_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniGetMaskShift_2x8 = {{ + 0x99999999, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x55555555, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniFp16toFp32_part1_4x4 = {{ + 0x09090909, // TCfg + 0x00000000, // ASelt + 0x00150004, 0x00370026, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniConvertI32toI16_2x8", &uniConvertI32toI16_2x8); + status |= vsi_nn_kernel_gpu_add_param( 
node, "uniGetMaskShift_2x8", &uniGetMaskShift_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniFp16toFp32_4x4", &uniFp16toFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniFp16toFp32_part1_4x4", &uniFp16toFp32_part1_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniExtactHalf8_2x8", &uniExtactHalf8_2x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (F16 == output_dtype) + { + status = vsi_nn_kernel_gpu_add_param( node, "uniExtactHalf8_2x8", &uniExtactHalf8_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniRightSubLeft_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertFp2FP32_left_4x4", + &uniConvertFp2FP32_left_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertFp2FP32_right_4x4", + &uniConvertFp2FP32_right_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else + { + status = vsi_nn_kernel_gpu_add_param( node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniRightSubLeft_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertFp2FP32_left_4x4", + &uniConvertFp2FP32_left_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertFp2FP32_right_4x4", + &uniConvertFp2FP32_right_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uint8Scale", &uint8Scale); + status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &uint8ZP_out); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + status = vsi_nn_kernel_gpu_add_param( node, "scale_x", &scale_factor); + status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); + CHECK_STATUS_FAIL_GOTO(status, final ); + gpu_param.global_scale[1] = out_height; + } + else if (BF16 == input_dtype && BF16 == output_dtype) + { + if (out_width > in_width) + { + gpu_dp_inst_t uniConvertI32toI16_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniGetMaskShift_2x8 = {{ + 0x99999999, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x55555555, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + status = vsi_nn_kernel_gpu_add_param( node, "uniConvertI32toI16_2x8", &uniConvertI32toI16_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniGetMaskShift_2x8", &uniGetMaskShift_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniConvBF16toF32_Part0_2x8", 
&uniConvBF16toF32_Part0_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else + { + gpu_dp_inst_t uniConvBF16toF32_odd_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x02050004, 0x06070406, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvBF16toF32_even_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x03050104, 0x07070506, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniConvBF16toF32_odd_2x8", &uniConvBF16toF32_odd_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniConvBF16toF32_even_2x8", &uniConvBF16toF32_even_2x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + status = vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); + status |= vsi_nn_kernel_gpu_add_param( node, "scale_x", &scale_factor); + CHECK_STATUS_FAIL_GOTO(status, final ); + gpu_param.global_scale[1] = out_height; + } + else + { + VSILOGE("input or output's format is not support"); + status = VSI_FAILURE; + goto final; + } + + if (!is_run_nx_kernel) + { + status = vsi_nn_kernel_gpu_add_param( node, "half_pixel_value", &half_pixel_value); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + if (is_run_nx_kernel) + { + gpu_param.global_size[0] = gpu_align_p2((out_width + \ + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = depth; + gpu_param.dim = 2; + } + else + { + gpu_param.global_size[0] = gpu_align_p2((out_width + \ + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (out_height + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1]; + gpu_param.global_size[2] = depth / gpu_param.global_scale[2]; + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); +final: + if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr ); + if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr ); + return status; +} /* _resize_1d_bilinear_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool is_same_type, + int32_t align_corners , + int32_t half_pixel_centers, + vsi_bool *is_run_opt_kernel, + vsi_bool *is_run_nx_kernel, + int32_t *scale_flag_value + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _resize_1d_bilinear_kernel_map; + size_t kernel_map_size = _cnt_of_array( _resize_1d_bilinear_kernel_map ); + vx_param_description_t * param_def = _resize_1d_bilinear_kernel_param_def; + size_t param_def_size = _cnt_of_array( _resize_1d_bilinear_kernel_param_def ); + vx_kernel_initialize_f initializer = _resize_1d_bilinear_initializer; + uint32_t key = 0; + uint32_t i = 0; + _internal_scale_e scale_flag = UP; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if 
(outputs[0]->attr.size[0] > inputs[0]->attr.size[0]) + { + scale_flag = UP; + + if (is_same_type) + { + scale_flag = UP_OPT; + } + + if (2 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) + { + if (is_same_type && (!align_corners) && (half_pixel_centers)) + { + scale_flag = UP_2X_HALF_SAME; + } + else if (is_same_type && (!align_corners) && (!half_pixel_centers)) + { + scale_flag = UP_2X_SAME; + } + } + else if (4 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) + { + if (is_same_type && (!align_corners) && (half_pixel_centers)) + { + scale_flag = UP_4X_HALF_SAME; + } + else if (is_same_type && (!align_corners) && (!half_pixel_centers)) + { + scale_flag = UP_4X_SAME; + } + } + else if (8 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) + { + if (is_same_type && (!align_corners) && (half_pixel_centers)) + { + scale_flag = UP_8X_HALF_SAME; + } + else if (is_same_type && (!align_corners) && (!half_pixel_centers)) + { + scale_flag = UP_8X_SAME; + } + } + } + else + { + scale_flag = DOWN; + if (inputs[0]->attr.size[0] == 2 * outputs[0]->attr.size[0]) + { + if (is_same_type && (!align_corners) && (half_pixel_centers)) + { + scale_flag = DOWN_2X_HALF_SAME; + } + else if (is_same_type && (!align_corners) && (!half_pixel_centers)) + { + scale_flag = DOWN_2X_SAME; + } + } + } + + key = RESIZE_1D_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag ); + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + + if ((scale_flag > UP_OPT) && (i >= kernel_map_size) && is_same_type) + { + scale_flag = UP_OPT; + key = RESIZE_1D_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag ); + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + } + + if ((UP_OPT == scale_flag) && (i >= kernel_map_size)) + { + scale_flag = UP; + key = RESIZE_1D_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag ); + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + } + + if ((scale_flag <= UP) && (scale_flag > DOWN) && (i >= kernel_map_size)) + { + scale_flag = DOWN; + key = RESIZE_1D_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag ); + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + } + + if ( i < kernel_map_size ) + { + if ((scale_flag > UP_OPT) || ((scale_flag > DOWN) && (scale_flag < UP))) + { + param_def_size = _RESIZE_1D_NX_KERENL_PARAM_NUM; + *is_run_nx_kernel = TRUE; + } + else if (UP_OPT == scale_flag) + { + param_def_size = _RESIZE_1D_BILINEAR_PARAM_NUM; + *is_run_opt_kernel = TRUE; + } + else + { + param_def_size = _RESIZE_NO_SCALE_PARAM_NUM; + *is_run_opt_kernel = FALSE; + } + *scale_flag_value = (int32_t)scale_flag; + + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_tensor_t* _create_scale_tensor + ( + vsi_nn_graph_t *graph, + vsi_nn_tensor_t *input, + vsi_nn_tensor_t *output, + int32_t align_corners, + int32_t half_pixel_centers + 
) +{ + vsi_nn_tensor_attr_t attr; + vsi_nn_tensor_t* scale = NULL; + uint32_t dims = output->attr.dim_num; + uint32_t batch = dims > 3 ? output->attr.size[3] : 1; + uint32_t width = output->attr.size[0]; + uint32_t sizes[4] = {width * 2, 1, 1, batch}; + uint32_t item_count = width * 2 * batch; + uint32_t input_width = input->attr.size[0]; + uint32_t x = 0; + uint32_t b = 0; + float width_scale = 1.0f; + uint16_t *scale_data_ptr = NULL; + + if (align_corners && width > 1) + { + width_scale = ((vx_float32)(input_width - 1) * 1.0f) / (vx_float32)(width - 1); + } + else + { + width_scale = ((vx_float32)input_width * 1.0f) / (vx_float32)width; + } + + scale_data_ptr = (uint16_t *)malloc(item_count * sizeof(uint16_t)); + if (scale_data_ptr == NULL) + { + VSILOGE("allocate memory fail at function %s line %d", __FUNCTION__, __LINE__); + goto OnError; + } + memset(scale_data_ptr, 0, item_count * sizeof(vx_uint16)); + for (b = 0; b < batch; b ++) + { + for (x = 0; x < width; x ++) + { + float input_w = 0.0f; + int32_t w0 = 0; + uint32_t idx = b * width * 2 + x * 2; + float tl = 0.0f; + float tr = 0.0f; + if (half_pixel_centers) + { + input_w = ((vx_float32)x + 0.5f) * width_scale - 0.5f; + } + else + { + input_w = x * width_scale; + } + w0 = (vx_int32)input_w; + tl = (1 - (input_w - w0)); + tr = (input_w - w0); + + scale_data_ptr[idx + 0] = fp32_to_fp16(tl); + scale_data_ptr[idx + 1] = fp32_to_fp16(tr); + } + } + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + + attr.size[0] = sizes[0]; + attr.size[1] = sizes[1]; + attr.size[2] = sizes[2]; + attr.size[3] = sizes[3]; + attr.dim_num = (batch == 1) ? 2 : 4; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + attr.vtl = FALSE; + + scale = vsi_nn_CreateTensorFromData(graph, (uint8_t *)scale_data_ptr, &attr); + if (scale_data_ptr) + { + free(scale_data_ptr); + scale_data_ptr = NULL; + } + +OnError: + return scale; +} + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_RESIZE_1D_BILINEAR_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); + int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); + vsi_bool is_same_type = vsi_nn_is_same_type(inputs[0], outputs[0]); + vsi_bool is_run_opt_kernel = FALSE; + vsi_bool is_run_nx_kernel = FALSE; + vsi_nn_tensor_t* scale = NULL; + int32_t scale_flag_value = 0; + + status = _query_kernel( kernel, inputs, outputs, is_same_type, + align_corners, half_pixel_centers, + &is_run_opt_kernel, &is_run_nx_kernel, + &scale_flag_value); + + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + size_t node_params_num = _RESIZE_NO_SCALE_PARAM_NUM; + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _RESIZE_1D_BILINEAR_PARAM_NUM, + inputs, input_num, outputs, output_num ); + + if (!is_run_nx_kernel) + { + node_params[SCALAR_ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners ); + node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, I32, &half_pixel_centers ); + }else + { + node_params[SCALAR_SCALE_TYPE] = vsi_nn_kernel_scalar_create( graph, I32, &scale_flag_value ); + node_params_num = _RESIZE_1D_NX_KERENL_PARAM_NUM; + } + + + if 
(is_run_opt_kernel) + { + scale = _create_scale_tensor(graph, inputs[0], outputs[0], align_corners, half_pixel_centers); + node_params[SCALAR_TENSOR_SCALE] = (vsi_nn_kernel_node_param_t)(scale->t); + node_params_num = _RESIZE_1D_BILINEAR_PARAM_NUM; + } + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); + VSI_ASSERT( status == VSI_SUCCESS ); + if (!is_run_nx_kernel) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALIGN_CORNERS] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] ); + } + else + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_TYPE] ); + } + + if (is_run_opt_kernel) + { + if (scale) + { + vsi_nn_ReleaseTensor(&scale); + } + } + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( resize_1d_bilinear, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c new file mode 100644 index 0000000..9fc49ee --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c @@ -0,0 +1,533 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +typedef enum +{ + LARGE = 0, + SMALL +} _internal_nearest_e; + +#define _RESIZE_1D_NEAREST_KERNEL_SOURCE "resize_1d_nearest" + +#define STR(a) #a +// Add kernel hashtable here +#define RESIZE_1D_NEAREST_HASH_KEY( IN_DTYPE, OUT_DTYPE, mode ) \ + (( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) | (mode)) + + +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_1D_NEAREST_HASH_KEY( IN_DTYPE, OUT_DTYPE, LARGE ), \ + CVIVANTE_NAMESPACE("evis.resize_1d_nearest_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + _RESIZE_1D_NEAREST_KERNEL_SOURCE } + +#define PACK_KERNEL_MAP_OPT( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_1D_NEAREST_HASH_KEY( IN_DTYPE, OUT_DTYPE, SMALL ), \ + CVIVANTE_NAMESPACE("evis.resize_1d_nearest_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_op"), \ + _RESIZE_1D_NEAREST_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _resize_1d_nearest_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP(F16, F16), + PACK_KERNEL_MAP(I16, I16), + PACK_KERNEL_MAP(I8, I8), + PACK_KERNEL_MAP(U8, U8), + PACK_KERNEL_MAP_OPT(F16, F16), + PACK_KERNEL_MAP_OPT(I16, I16), + PACK_KERNEL_MAP_OPT(I8, I8), + PACK_KERNEL_MAP_OPT(U8, U8), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _resize_1d_nearest_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _RESIZE_1D_NEAREST_PARAM_NUM _cnt_of_array( _resize_1d_nearest_kernel_param_def ) + +#define SCALAR_ALIGN_CORNERS (2) +#define SCALAR_HALF_PIXEL (3) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_resize_1d_nearest_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ +#define MAX_POST_SHIFT_BITS (31) +#define MAX_MULTIPLIER_NUM (65535) + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_nn_kernel_tensor_attr_t * input_attr = NULL; + vsi_int_array_t * out_shape = NULL; + vsi_int_array_t * in_shape = NULL; + vsi_nn_kernel_dtype_e input_dtype = F16; + vsi_nn_kernel_dtype_e output_dtype = F16; + int32_t align_corners = 0; + int32_t half_pixel_centers = 0; + uint32_t depth = 0; + int32_t srcFixPointPos = 0; + int32_t dstFixPointPos = 0; + float input_scale = 1.0; + int32_t inputZP = 0; + float output_scale = 1.0; + int32_t outputZP = 0; + float scale_factor = 1.0f; + uint32_t in_width = 0; + uint32_t out_width = 0; + uint32_t out_height = 0; + float half_pixel_value = 0.0f; + float round_value = 0.0f; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &align_corners); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &half_pixel_centers); + CHECK_STATUS_FAIL_GOTO(status, final ); + out_shape = output_attr->shape; + in_shape = input_attr->shape; + input_dtype = input_attr->dtype; + output_dtype = output_attr->dtype; + 
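(Reference aside, not part of the patch: the initializer above has just read align_corners / half_pixel_centers and the tensor attributes, and what follows folds them into scale_x, half_pixel_value and round_value for the 1D nearest kernels. The scalar form of that coordinate mapping, mirroring this initializer and resize_1d_nearest.cl, is sketched below; the helper name and the defensive clamp are illustrative additions and do not appear in this repository.)

#include <stdint.h>

/* Minimal sketch of the 1D nearest-neighbour source-index math encoded
 * by the initializer: scale_factor, half-pixel shift, rounding bias,
 * then truncation, exactly as the OpenCL kernel computes in_x_idx. */
static int32_t resize_1d_nearest_src_index(
    int32_t out_x, uint32_t in_width, uint32_t out_width,
    int32_t align_corners, int32_t half_pixel_centers)
{
    /* Same scale_factor selection as the initializer above. */
    float scale = (align_corners && out_width > 1)
        ? (float)(in_width - 1) / (float)(out_width - 1)
        : (float)in_width / (float)out_width;
    float half_pixel_value = half_pixel_centers ? 0.5f : 0.0f; /* shift to pixel centers */
    float round_value      = align_corners      ? 0.5f : 0.0f; /* round-to-nearest bias  */

    /* Same expression as resize_1d_nearest.cl: scale, then truncate. */
    float in_x = ((float)out_x + half_pixel_value) * scale + round_value;
    int32_t idx = (int32_t)in_x;

    /* Defensive clamp for this sketch only; the kernels rely on valid
     * image reads and do not clamp here. */
    if (idx < 0) idx = 0;
    if (idx > (int32_t)in_width - 1) idx = (int32_t)in_width - 1;
    return idx;
}

(With align_corners = 0 and half_pixel_centers = 0 this reduces to plain truncation, idx = (int)(out_x * in_width / out_width), which is the default path the kernel takes when both scalars are zero.)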
+ in_width = in_shape->data[0]; + depth = in_shape->data[2]; + out_width = out_shape->data[0]; + out_height = out_shape->data[1]; + + if (BF16 == input_dtype && output_dtype == BF16) + { + input_dtype = F16; + output_dtype = F16; + } + if (align_corners && out_width > 1) + { + scale_factor = ((float)(in_width - 1) * 1.0f) / (float)(out_width - 1); + } + else + { + scale_factor = ((float)in_width * 1.0f) / (float)out_width; + } + + + if (align_corners) + { + round_value = 0.5f; + } + else + { + round_value = 0.0f; + } + + if (half_pixel_centers) + { + half_pixel_value = 0.5f; + } + else + { + half_pixel_value = 0.0f; + } + + if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant ) + { + input_scale = input_attr->asymm.scale; + inputZP = input_attr->asymm.zero_point; + } + else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) + { + srcFixPointPos = input_attr->dfp.fl; + if (srcFixPointPos >= 0) + { + input_scale = 1.0f / (float) ((int64_t)1 << srcFixPointPos); + } + else if (srcFixPointPos < 0) + { + input_scale = (float)((int64_t)1 << -srcFixPointPos); + } + inputZP = 0; + } + else + { + input_scale = 1.0f; + inputZP = 0; + } + + if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant ) + { + output_scale = 1.0f / output_attr->asymm.scale; + outputZP = output_attr->asymm.zero_point; + } + else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) + { + dstFixPointPos = output_attr->dfp.fl; + if (dstFixPointPos >= 0) + { + output_scale = (float) ((int64_t)1 << dstFixPointPos); + } + else if (dstFixPointPos < 0) + { + output_scale = 1.0f / (float) ((int64_t)1 << -dstFixPointPos); + } + outputZP = 0; + } + else + { + output_scale = 1.0; + outputZP = 0; + } + + if (F16 == input_dtype && F16 == output_dtype) + { + gpu_dp_inst_t uniGetExtractData_2x8 = {{ + 0x00009999, // TCfg + 0x00000000, // ASelt + 0x06040200, 0x00000000, // ABin + 0x0000aaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00100010, 0x00100010, 0x00100010, 0x00100010, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + if (scale_factor < 4.0f) + { + status = vsi_nn_kernel_gpu_add_param( node, "uniGetExtractData_2x8", &uniGetExtractData_2x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + status = vsi_nn_kernel_gpu_add_param( node, "scale_x", &scale_factor); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if ( input_dtype == output_dtype && (I8 == input_dtype || I16 == input_dtype)) + { + gpu_dp_inst_t uniGetExtractData_2x8 = {{ + 0x00009999, // TCfg + 0x00000000, // ASelt + 0x06040200, 0x00000000, // ABin + 0x0000aaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00080008, 0x00080008, 0x00080008, 0x00080008, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertI8toI8_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + + if (I16 == input_dtype) + { + uniGetExtractData_2x8.data[8] = 0x00100010; + uniGetExtractData_2x8.data[9] = 0x00100010; + uniGetExtractData_2x8.data[10] = 0x00100010; + uniGetExtractData_2x8.data[11] = 0x00100010; + 
uniGetExtractData_2x8.data[12] = 0x00100010; + uniGetExtractData_2x8.data[13] = 0x00100010; + uniGetExtractData_2x8.data[14] = 0x00100010; + uniGetExtractData_2x8.data[15] = 0x00100010; + } + + if (srcFixPointPos > dstFixPointPos) + { + int32_t postshift = vsi_nn_min(srcFixPointPos - dstFixPointPos, MAX_POST_SHIFT_BITS); + + uniConvertI8toI8_2x8.data[7] |= (postshift & 0x1F); + } + else + { + uint32_t multiplier = vsi_nn_min((int64_t)1 << (dstFixPointPos - srcFixPointPos), MAX_MULTIPLIER_NUM); + uint32_t i = 0; + + for (i = 0; i < 8; i++) + { + uniConvertI8toI8_2x8.data[i + 8] = multiplier; + } + } + + if (scale_factor < 4.0f) + { + status = vsi_nn_kernel_gpu_add_param( node, "uniGetExtractData_2x8", &uniGetExtractData_2x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + status = vsi_nn_kernel_gpu_add_param( node, "scale_x", &scale_factor); + status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertI8toI8_2x8", &uniConvertI8toI8_2x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (U8 == input_dtype && U8 == output_dtype) + { + uint16_t M0 = 0; + int32_t postShift = 0; + uint32_t multAndoutZP[2] = {0}; + gpu_dp_inst_t uniMultiplyAndPostShift_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniGetExtractData_2x8 = {{ + 0x00009999, // TCfg + 0x00000000, // ASelt + 0x06040200, 0x00000000, // ABin + 0x0000aaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00080008, 0x00080008, 0x00080008, 0x00080008, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + gpu_quantize_multiplier_16bit(input_scale * output_scale, &M0, &postShift); + + multAndoutZP[0] = (uint32_t)(M0); + multAndoutZP[1] = (uint32_t)((outputZP << postShift) - inputZP * M0); + + uniMultiplyAndPostShift_2x8.data[7] |= (postShift & 0x1F); + + if (scale_factor < 4.0f) + { + status = vsi_nn_kernel_gpu_add_param( node, "uniGetExtractData_2x8", &uniGetExtractData_2x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + + status = vsi_nn_kernel_gpu_add_param( node, "scale_x", &scale_factor); + status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP", multAndoutZP); + status |= vsi_nn_kernel_gpu_add_param( node, "uniMultiplyAndPostShift_2x8", &uniMultiplyAndPostShift_2x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + status = vsi_nn_kernel_gpu_add_param( node, "half_pixel_value", &half_pixel_value); + status |= vsi_nn_kernel_gpu_add_param( node, "round_value", &round_value); + CHECK_STATUS_FAIL_GOTO(status, final ); + + gpu_param.global_size[0] = gpu_align_p2((out_width + gpu_param.global_scale[0] - 1)\ + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (out_height + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1]; + gpu_param.global_size[2] = depth; + + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); +#undef MAX_MULTIPLIER_NUM +#undef MAX_POST_SHIFT_BITS +final: + if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr ); + if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr ); + return 
status; +} /* _resize_nearest_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t align_corners + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype = F16; + vsi_nn_kernel_dtype_e out_dtype = F16; + const _kernel_map_type * kernel_map = _resize_1d_nearest_kernel_map; + size_t kernel_map_size = _cnt_of_array( _resize_1d_nearest_kernel_map ); + vx_param_description_t * param_def = _resize_1d_nearest_kernel_param_def; + size_t param_def_size = _cnt_of_array( _resize_1d_nearest_kernel_param_def ); + vx_kernel_initialize_f initializer = _resize_1d_nearest_initializer; + + uint32_t key = 0; + uint32_t i = 0; + uint32_t inputWidth = inputs[0]->attr.size[0]; + uint32_t outputWidth = outputs[0]->attr.size[0]; + float scale_factor; + _internal_nearest_e resize_mode = LARGE; + + if (align_corners && outputWidth > 1) + { + scale_factor = (vx_float32)(inputWidth - 1) / (vx_float32)(outputWidth - 1); + } + else + { + scale_factor = (vx_float32)inputWidth / (vx_float32)outputWidth; + } + + if (scale_factor < 4.0f) + { + resize_mode = SMALL; + } + else + { + resize_mode = LARGE; + } + + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (BF16 == in_dtype && BF16 == out_dtype) + { + in_dtype = F16; + out_dtype = F16; + } + + key = RESIZE_1D_NEAREST_HASH_KEY( in_dtype, out_dtype, resize_mode ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_RESIZE_1D_NEAREST_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); + int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); + + status = _query_kernel( kernel, inputs, outputs, align_corners ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _RESIZE_1D_NEAREST_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners ); + node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, I32, &half_pixel_centers ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _RESIZE_1D_NEAREST_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALIGN_CORNERS] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( resize_1d_nearest, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c index 52fb9d4..0cc7c61 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c @@ -850,7 +850,7 @@ static vsi_status _query_kernel } } - if ((UP_2X_HALF == scale_flag) && (i >= kernel_map_size)) + if ((UP_2X_HALF == scale_flag) && (i >= kernel_map_size) && is_same_type && is_evis2) { scale_flag = UP_OPT; key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag ); diff --git a/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c index 9d17244..9b485a0 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c @@ -348,7 +348,7 @@ DEF_KERNEL_INITIALIZER(_resize_nearest_initializer) else if (U8 == input_dtype && U8 == output_dtype) { uint16_t M0 = 0; - vx_int8 postShift = 0; + int32_t postShift = 0; uint32_t multAndoutZP[2] = {0}; gpu_dp_inst_t uniMultiplyAndPostShift_2x8 = {{ 0xdddddddd, // TCfg @@ -371,7 +371,7 @@ DEF_KERNEL_INITIALIZER(_resize_nearest_initializer) 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16}; - vsi_nn_GetFP32MultiAndPostShift(input_scale * output_scale, &M0, &postShift); + gpu_quantize_multiplier_16bit(input_scale * output_scale, &M0, &postShift); multAndoutZP[0] = (uint32_t)(M0); multAndoutZP[1] = (uint32_t)((outputZP << postShift) - inputZP * M0); diff --git a/src/tim/vx/internal/src/kernel/evis/select_evis.c b/src/tim/vx/internal/src/kernel/evis/select_evis.c index 9e95f5d..e0975d3 100644 --- a/src/tim/vx/internal/src/kernel/evis/select_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/select_evis.c @@ -131,9 +131,9 @@ DEF_KERNEL_INITIALIZER(_select_initializer) float outputScale = 1.0f; int32_t outputZP = 0; uint16_t in0_M0 = 0; - int8_t in0_postShift = 0; + int32_t in0_postShift = 0; uint16_t in1_M0 = 0; - int8_t in1_postShift = 0; + int32_t in1_postShift = 0; uint32_t pack_key = 0; input0_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0); CHECK_PTR_FAIL_GOTO( input0_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -196,8 +196,8 @@ DEF_KERNEL_INITIALIZER(_select_initializer) outputZP = output_attr->asymm.zero_point; } - vsi_nn_GetFP32MultiAndPostShift(input0Scale / outputScale, &in0_M0, &in0_postShift); - vsi_nn_GetFP32MultiAndPostShift(input1Scale / outputScale, &in1_M0, &in1_postShift); + gpu_quantize_multiplier_16bit(input0Scale / outputScale, &in0_M0, &in0_postShift); + gpu_quantize_multiplier_16bit(input1Scale / outputScale, &in1_M0, &in1_postShift); pack_key = _PACK_SELECT_KEY( input0_attr->dtype, input1_attr->dtype, output_attr->dtype ); diff --git a/src/tim/vx/internal/src/kernel/evis/tile_evis.c b/src/tim/vx/internal/src/kernel/evis/tile_evis.c index 7fa9215..a076329 100644 --- a/src/tim/vx/internal/src/kernel/evis/tile_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/tile_evis.c @@ -309,7 +309,7 @@ DEF_KERNEL_INITIALIZER(_tile_initializer) { float uint8Scale = 
scaleIn / scaleOut; uint16_t M0 = 0; - int8_t postShift = 0; + int32_t postShift = 0; uint32_t multAndoutZP[2] = {0}; gpu_dp_inst_t uniU8MulAndPostShift_Lo_2x8 = {{ @@ -323,7 +323,7 @@ DEF_KERNEL_INITIALIZER(_tile_initializer) 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16}; - vsi_nn_GetFP32MultiAndPostShift(uint8Scale, &M0, &postShift); + gpu_quantize_multiplier_16bit(uint8Scale, &M0, &postShift); multAndoutZP[0] = (uint32_t)(M0); multAndoutZP[1] = (uint32_t)((output_ZP << postShift) - input_ZP * M0); diff --git a/src/tim/vx/internal/src/kernel/evis/upsample_evis.c b/src/tim/vx/internal/src/kernel/evis/upsample_evis.c index 1f96b93..df549fd 100644 --- a/src/tim/vx/internal/src/kernel/evis/upsample_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/upsample_evis.c @@ -154,7 +154,7 @@ DEF_KERNEL_INITIALIZER(_upsample_initializer) int32_t input_fl = 0; int32_t output_fl = 0; uint16_t M0 = 0; - int8_t postShift = 0; + int32_t postShift = 0; float inputScale = 1.0f; int32_t input_ZP = 0; float outputScale = 1.0f; @@ -212,7 +212,7 @@ DEF_KERNEL_INITIALIZER(_upsample_initializer) factorOut = 1.0f / outputScale; - vsi_nn_GetFP32MultiAndPostShift(inputScale / outputScale, &M0, &postShift); + gpu_quantize_multiplier_16bit(inputScale / outputScale, &M0, &postShift); image_2d = (vsi_bool)(input_shape->size < 3 || 1 == input_shape->data[2]); diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_gpu.c b/src/tim/vx/internal/src/kernel/vsi_nn_gpu.c index 34d4408..9856fca 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_gpu.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_gpu.c @@ -84,7 +84,7 @@ void gpu_quantize_multiplier_32bit double q; int64_t q_fixed; const int32_t bit = 32; - if( vsi_abs(double_multiplier - 0.0) < 1e-5 ) + if( vsi_abs(double_multiplier - 0.0) < 1e-8 ) { *quantize_multiplier = 0; *shift = bit - 0; @@ -116,6 +116,25 @@ void gpu_quantize_multiplier_32bit } } /* gpu_quantize_multiplier_32_bit() */ +void _modify_multiplier_postshift + ( + uint16_t * quantize_multiplier, + int32_t * shift + ) +{ + uint16_t multiplier = *quantize_multiplier; + int32_t postshift = *shift; + + while (postshift > GPU_MAX_POST_SHIFT_BITS) + { + multiplier = (multiplier + 1) >> 1; + postshift --; + } + + *quantize_multiplier = multiplier; + *shift = postshift; +} + void gpu_quantize_multiplier_16bit ( double double_multiplier, @@ -135,5 +154,8 @@ void gpu_quantize_multiplier_16bit { *shift -= bit; } + + _modify_multiplier_postshift(quantize_multiplier, shift); + } /* gpu_quantize_multiplier_16bit() */ diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c index af3b91b..e0c43e2 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c @@ -1145,10 +1145,10 @@ vsi_nn_kernel_tensor_attr_t * vsi_nn_kernel_tensor_attr_create status = vxQueryTensor( (vx_tensor)tensor, VX_TENSOR_SCALE, &(attr->asymm.scale), sizeof(float)); CHECK_STATUS( status ); - // Reset scale to 1 - if( (attr->asymm.scale - 0.f) < 1e-5 ) + // Reset scale to 1e-8 + if( (attr->asymm.scale - 0.f) < 1e-8 ) { - attr->asymm.scale = 1.0f; + attr->asymm.scale = (float)1e-8; attr->asymm.zero_point = 0; } } @@ -1225,12 +1225,16 @@ vsi_status vsi_nn_kernel_pirority_set static vsi_bool _check_shader_support(vsi_nn_graph_t* graph) { char *envctrl; - int32_t enableShader = 1; + static int32_t enableShader = -1; - envctrl = getenv("VIV_VX_ENABLE_SHADER"); - if (envctrl) + if (enableShader == -1) { - enableShader = atoi(envctrl); + 
enableShader = 1; + envctrl = getenv("VIV_VX_ENABLE_SHADER"); + if (envctrl) + { + enableShader = atoi(envctrl); + } } #if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT @@ -1240,7 +1244,7 @@ static vsi_bool _check_shader_support(vsi_nn_graph_t* graph) } #endif - if(enableShader == 1) + if (enableShader >= 1) { return TRUE; } diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_eltwise.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_eltwise.c index 7be998c..53597b9 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_eltwise.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_eltwise.c @@ -459,7 +459,7 @@ vsi_bool vsi_nn_kernel_optimize_broadcast_shape k = 0; for (j = 0; j < (size_t)input_num; j++) { - if (size_in[k] > 1) + if (size_in[j] > 1) { k = j; break; diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c index 79b3468..9ea24e5 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c @@ -603,10 +603,14 @@ vsi_nn_tensor_t* vsi_nn_merge_input_zeropoint_to_bias vsi_nn_tensor_t * bias ) { - vsi_nn_tensor_t * new_bias = NULL; + vsi_nn_tensor_t * new_bias = NULL; vsi_nn_tensor_attr_t attr; - + int32_t *new_bias_data_ptr = NULL; + uint8_t *weight_data = NULL; + int32_t *bias_data = NULL; + uint32_t i, j; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + weight_data = vsi_nn_ConvertTensorToData(graph, weight); if (bias == NULL) { @@ -620,26 +624,47 @@ vsi_nn_tensor_t* vsi_nn_merge_input_zeropoint_to_bias attr.dtype.zero_point = 0; attr.dtype.vx_type = VSI_NN_TYPE_INT32; } - else - { - VSILOGE("need to add ..."); - } } else { memcpy(&attr, &bias->attr, sizeof(vsi_nn_tensor_attr_t)); + if (attr.dim_num == 1) + { + attr.size[1] = 1; + attr.dim_num = 2; + } + bias_data = (int32_t *)vsi_nn_ConvertTensorToData(graph, bias); } - new_bias = vsi_nn_CreateTensorWithDefault(graph, &attr, 0.0); + new_bias_data_ptr = (int32_t *)malloc(attr.size[0] * sizeof(int32_t)); + memset((void *)new_bias_data_ptr, 0, sizeof(int32_t) * attr.size[0]); - if (input->attr.dtype.zero_point == 0) + if (input->attr.dtype.zero_point != 0) { - return new_bias; + for (i = 0; i < weight->attr.size[1]; i++) + { + uint8_t *weight_ptr = weight_data + i * weight->attr.size[0]; + for (j = 0; j < weight->attr.size[0]; j++) + { + new_bias_data_ptr[i] += -((int32_t)weight_ptr[j] - weight->attr.dtype.zero_point) \ + * input->attr.dtype.zero_point; + } + } } - else + + if (bias_data != NULL) { - VSILOGE("need to process bias - (input_zp * (w - w_zp)) ..."); + for (i = 0; i < weight->attr.size[1]; i++) + { + new_bias_data_ptr[i] += bias_data[i]; + } } + new_bias = vsi_nn_CreateTensorFromData(graph, (uint8_t *)new_bias_data_ptr, &attr); + + vsi_nn_safe_free( new_bias_data_ptr ); + vsi_nn_safe_free( bias_data ); + vsi_nn_safe_free( weight_data ); + return new_bias; } diff --git a/src/tim/vx/internal/src/kernel/vx/convolutional.c b/src/tim/vx/internal/src/kernel/vx/convolutional.c index 235e5ac..bb0d060 100644 --- a/src/tim/vx/internal/src/kernel/vx/convolutional.c +++ b/src/tim/vx/internal/src/kernel/vx/convolutional.c @@ -31,6 +31,8 @@ #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_node.h" #include "vsi_nn_feature.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_graph_optimization.h" static vsi_bool _build_vx_conv2d_param ( @@ -173,6 +175,7 @@ static vx_tensor _expand_tensor_dim vsi_nn_kernel_t * kernel \ ) + REGISTER_CONV_OPENVX_KERNEL( conv1d ) { vx_node node = NULL; @@ -196,10 
+199,34 @@ REGISTER_CONV_OPENVX_KERNEL( conv1d ) temp_tensors[0] = _expand_tensor_dim( inputs[0]->t, (int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, 0 ); CHECK_PTR_FAIL_GOTO( temp_tensors[0], "Expand input dim fail.", final ); + if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) + { + temp_tensors[1] = _expand_tensor_dim( inputs[1]->t, + (int32_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, 0 ); + CHECK_PTR_FAIL_GOTO( temp_tensors[1], "Expand kernel dim fail.", final ); + } + else + { + uint8_t * data = NULL; + vsi_nn_tensor_attr_t attr; + uint32_t i; - temp_tensors[1] = _expand_tensor_dim( inputs[1]->t, - (int32_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, 0 ); - CHECK_PTR_FAIL_GOTO( temp_tensors[1], "Expand kernel dim fail.", final ); + data = vsi_nn_ConvertTensorToData( graph, inputs[1] ); + CHECK_PTR_FAIL_GOTO( data, "Convert data fail.", final ); + + memcpy(&attr, &inputs[1]->attr, sizeof(vsi_nn_tensor_attr_t)); + + attr.size[0] = 1; + for (i = 1; i <= inputs[1]->attr.dim_num; i++) + { + attr.size[i] = inputs[1]->attr.size[i - 1]; + } + attr.dim_num = inputs[1]->attr.dim_num + 1; + attr.dtype.channel_dim = inputs[1]->attr.dtype.channel_dim + 1; + + temp_tensors[1] = vsi_nn_CreateRawTensorFromData(graph, data, &attr); + vsi_nn_safe_free( data ); + } temp_tensors[2] = _expand_tensor_dim( outputs[0]->t, (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, 0 ); @@ -248,9 +275,38 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d ) (int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, 0 ); CHECK_PTR_FAIL_GOTO( temp_tensors[0], "Expand input dim fail.", final ); - temp_tensors[1] = _expand_tensor_dim( inputs[1]->t, - (int32_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, 0 ); - CHECK_PTR_FAIL_GOTO( temp_tensors[1], "Expand kernel dim fail.", final ); + if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) + { + temp_tensors[1] = _expand_tensor_dim( inputs[1]->t, + (int32_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, 0 ); + CHECK_PTR_FAIL_GOTO( temp_tensors[1], "Expand kernel dim fail.", final ); + } + else + { + uint8_t * data = NULL; + vsi_nn_tensor_attr_t attr; + uint32_t i; + + data = vsi_nn_ConvertTensorToData( graph, inputs[1] ); + CHECK_PTR_FAIL_GOTO( data, "Convert data fail.", final ); + + memcpy(&attr, &inputs[1]->attr, sizeof(vsi_nn_tensor_attr_t)); + + attr.size[0] = 1; + attr.size[1] = inputs[1]->attr.size[0]; + attr.size[2] = 1; + for (i = 1; i < inputs[1]->attr.dim_num; i++) + { + attr.size[2] *= inputs[1]->attr.size[i]; + } + attr.size[3] = 1; + attr.dim_num = 4; + attr.dtype.channel_dim = 2; + + temp_tensors[1] = vsi_nn_CreateRawTensorFromData(graph, data, &attr); + + vsi_nn_safe_free( data ); + } temp_tensors[2] = _expand_tensor_dim( outputs[0]->t, (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, 0 ); diff --git a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c index e259554..c78de9d 100644 --- a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c @@ -38,27 +38,27 @@ typedef struct _sort_lut_s float val; } sort_lut; -static float exp_eval(float val) +static float exp_eval(float val, float alpha) { return expf(val); } -static float log_eval(float data) +static float log_eval(float data, float alpha) { return logf(data); } -static float elu_eval(float data) +static float elu_eval(float data, float alpha) { - return data >=0 ? 
data : expf(data) - 1; + return data >=0 ? data : expf(data) * alpha - alpha; } -static float neg_eval(float data) +static float neg_eval(float data, float alpha) { return data * -1.0f; } -static float hsigmoid_eval(float data) +static float hsigmoid_eval(float data, float alpha) { data = (float)(0.2 * data + 0.5); data = vsi_nn_clamp(data, 0, 1); @@ -66,14 +66,14 @@ static float hsigmoid_eval(float data) return data; } -static float soft_plus_eval(float data) +static float soft_plus_eval(float data, float alpha) { - return log_eval(exp_eval(data) + 1); + return log_eval(exp_eval(data, alpha) + 1, alpha); } -static float mish_eval(float data) +static float mish_eval(float data, float alpha) { - data = (float)(data * tanh(soft_plus_eval(data))); + data = (float)(data * tanh(soft_plus_eval(data, alpha))); return data; } @@ -96,7 +96,7 @@ static int32_t _lut_comparator(const void *pa, const void *pb) return 0; } -static void _set_unary_table_lookup(float func(float), float *index, float *value) +static void _set_unary_table_lookup(float func(float, float), float *index, float *value, float alpha) { #define VSI_NN_MAX_LUT_SIZE (1024) #define FLT16_MAX (57344) @@ -108,25 +108,25 @@ static void _set_unary_table_lookup(float func(float), float *index, float *valu { int16_t val = (int16_t)(i << 6); lut[i].index = fp16_to_fp32(val); - lut[i].val = func(lut[i].index); + lut[i].val = func(lut[i].index, alpha); } for (i = 0x0; i < 0x10; i++) { lut[i].index = 0; - lut[i].val = func(lut[i].index); + lut[i].val = func(lut[i].index, alpha); } for (i = 0x1F0; i < 0x200; i++) { lut[i].index = FLT16_MAX; - lut[i].val = func(lut[i].index); + lut[i].val = func(lut[i].index, alpha); } for (i = 0x3F0; i < 0x400; i++) { lut[i].index = FLT16_MIN; - lut[i].val = func(lut[i].index); + lut[i].val = func(lut[i].index, alpha); } qsort(lut, VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut), _lut_comparator); @@ -154,13 +154,14 @@ static vsi_nn_kernel_node_t _setup size_t output_num, const vsi_nn_kernel_param_t * params, vsi_nn_kernel_t * kernel, - float func(float) + float func(float, float) ) { #ifdef VX_USER_LOOKUP_TABLE_SUPPORT vx_lut lut1 = NULL; vx_lut lut2 = NULL; vx_node node = NULL; + float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" ); float index[1024] = {0}; float value[1024] = {0}; @@ -172,7 +173,7 @@ static vsi_nn_kernel_node_t _setup return NULL; } - _set_unary_table_lookup(func, index, value); + _set_unary_table_lookup(func, index, value, alpha); lut1 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024); lut2 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024); diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl index c702951..68febfb 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl @@ -1,12 +1,12 @@ -float4 eltwise_unary_sin(float4 x) +float4 eltwise_unary_sin(float4 x, float alpha) { return native_sin(x); } #define logE (1.44269502f) #define twoLogE (logE * 2.0f) -float4 eltwise_unary_exp(float4 x) +float4 eltwise_unary_exp(float4 x, float alpha) { x *= logE; x = exp2(x); @@ -14,33 +14,33 @@ float4 eltwise_unary_exp(float4 x) } #define rlogE (0.693147182f) -float4 eltwise_unary_log(float4 x) +float4 eltwise_unary_log(float4 x, float alpha) { x = log2(x); return x * rlogE; } -float4 eltwise_unary_elu(float4 val) +float4 eltwise_unary_elu(float4 val, float alpha) { float4 x = val * logE; - x = exp2(x) - 1; + x = exp2(x) * alpha - alpha; return 
val < 0 ? x : val; } -float4 eltwise_unary_neg(float4 x) +float4 eltwise_unary_neg(float4 x, float alpha) { return x * -1; } -float4 eltwise_unary_hard_sigmoid(float4 x) +float4 eltwise_unary_hard_sigmoid(float4 x, float alpha) { x = 0.2 * x + 0.5; x = clamp(x, 0, 1); return x; } -float4 _softrelu(float4 x) +float4 _softrelu(float4 x, float alpha) { x *= logE; x = exp2(x); @@ -49,7 +49,7 @@ float4 _softrelu(float4 x) return x * rlogE; } -float4 _tanh(float4 x) +float4 _tanh(float4 x, float alpha) { x *= -twoLogE; x = 1 + exp2(x); @@ -57,10 +57,10 @@ float4 _tanh(float4 x) return (2 * x - 1); } -float4 eltwise_unary_mish(float4 x) +float4 eltwise_unary_mish(float4 x, float alpha) { - float4 y = _softrelu(x); - x = x * _tanh(y); + float4 y = _softrelu(x, alpha); + x = x * _tanh(y, alpha); return x; } @@ -72,14 +72,15 @@ __kernel void func_name##_F32toF32 \ float inputScale, \ float inputTail, \ float outputScale, \ - float outputZP \ + float outputZP, \ + float alpha \ ) \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ \ float4 src = read_imagef(input, coord); \ \ - float4 dst = eltwise_unary_##func_name(src); \ + float4 dst = eltwise_unary_##func_name(src, alpha); \ \ write_imagef(output, coord, dst); \ } @@ -99,14 +100,15 @@ __kernel void func_name##_F32toF32_2D \ float inputScale, \ float inputTail, \ float outputScale, \ - float outputZP \ + float outputZP, \ + float alpha \ ) \ { \ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ \ float4 src = read_imagef(input, coord); \ \ - float4 dst = eltwise_unary_##func_name(src); \ + float4 dst = eltwise_unary_##func_name(src, alpha); \ \ write_imagef(output, coord, dst); \ } @@ -126,7 +128,8 @@ __kernel void func_name##_U8toU8 \ float inputScale, \ float inputTail, \ float outputScale, \ - float outputZP \ + float outputZP, \ + float alpha \ ) \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ @@ -134,7 +137,7 @@ __kernel void func_name##_U8toU8 \ uint4 src = read_imageui(input, coord); \ float4 data = convert_float4(src) * inputScale - inputTail; \ \ - data = eltwise_unary_##func_name(data); \ + data = eltwise_unary_##func_name(data, alpha); \ uint4 dst = convert_uint4(data * outputScale + outputZP); \ \ write_imageui(output, coord, dst); \ @@ -155,7 +158,8 @@ __kernel void func_name##_U8toU8_2D \ float inputScale, \ float inputTail, \ float outputScale, \ - float outputZP \ + float outputZP, \ + float alpha \ ) \ { \ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ @@ -163,7 +167,7 @@ __kernel void func_name##_U8toU8_2D \ uint4 src = read_imageui(input, coord); \ float4 data = convert_float4(src) * inputScale - inputTail; \ \ - data = eltwise_unary_##func_name(data); \ + data = eltwise_unary_##func_name(data, alpha); \ uint4 dst = convert_uint4(data * outputScale + outputZP); \ \ write_imageui(output, coord, dst); \ @@ -184,7 +188,8 @@ __kernel void neg_I32toI32 float inputScale, float inputTail, float outputScale, - float outputZP + float outputZP, + float alpha ) { int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); @@ -202,7 +207,8 @@ __kernel void neg_I32toI32_2D float inputScale, float inputTail, float outputScale, - float outputZP + float outputZP, + float alpha ) { int2 coord = (int2)(get_global_id(0), get_global_id(1)); diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/resize_1d_bilinear.cl b/src/tim/vx/internal/src/libnnext/ops/cl/resize_1d_bilinear.cl new file mode 100644 index 0000000..c7cbde2 --- /dev/null +++ 
b/src/tim/vx/internal/src/libnnext/ops/cl/resize_1d_bilinear.cl @@ -0,0 +1,57 @@ +__kernel void resize_1d_bilinear_F32toF32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float scale_x, + float half_pixel_value + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x - half_pixel_value; + float left_x_f = floor(in_x); + float x_lerp = in_x - left_x_f; + int left_x_idx = convert_int(left_x_f); + int4 coord_in = (int4)(left_x_idx, coord_out.y, coord_out.z, 0); + float4 top_l, top_r, top, bottom, dst; + + top_l = read_imagef(input, coord_in); + coord_in.x++; + top_r = read_imagef(input, coord_in); + + top_r = top_r - top_l; + dst = top_l + x_lerp * top_r; + + write_imagef(output, coord_out, dst); + +} + + +__kernel void resize_1d_bilinear_U8toU8( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float scale_x, + float half_pixel_value, + float in_scale, + float in_tail, + float out_scale, + float out_tail + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x - half_pixel_value; + float left_x_f = floor(in_x); + float x_lerp = in_x - left_x_f; + int left_x_idx = convert_int(left_x_f); + int4 coord_in = (int4)(left_x_idx, coord_out.y, coord_out.z, 0); + float4 top_l, top_r, top; + uint4 dst; + + top_l = convert_float4(read_imageui(input, coord_in)) * in_scale + in_tail; + coord_in.x++; + top_r = convert_float4(read_imageui(input, coord_in)) * in_scale + in_tail; + + top_r = top_r - top_l; + top = top_l + x_lerp * top_r; + dst = convert_uint4(top * out_scale + out_tail); + + write_imageui(output, coord_out, dst); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/resize_1d_nearest.cl b/src/tim/vx/internal/src/libnnext/ops/cl/resize_1d_nearest.cl new file mode 100644 index 0000000..f6ac8b9 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/resize_1d_nearest.cl @@ -0,0 +1,36 @@ + +#define NEAREST_INDEX_PROCESS() \ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x + round_value; \ + int in_x_idx = convert_int(in_x); \ + +__kernel void resize_1d_nearest_F32toF32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float scale_x, + float half_pixel_value, + float round_value) +{ + NEAREST_INDEX_PROCESS() + int4 coord_in = (int4)(in_x_idx, coord_out.y, coord_out.z, 0); + float4 dst; + dst = read_imagef(input, coord_in); + write_imagef(output, coord_out, dst); +} + + +__kernel void resize_1d_nearest_U8toU8( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float scale_x, + float half_pixel_value, + float round_value, + float output_scale, + float output_tail) +{ + NEAREST_INDEX_PROCESS() + int4 coord_in = (int4)(in_x_idx, coord_out.y, coord_out.z, 0); + uint4 dst; + dst = convert_uint4(convert_float4(read_imageui(input, coord_in)) * output_scale + output_tail); + write_imageui(output, coord_out, dst); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx index bc3b6c4..8b03b5c 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx @@ -1,5 +1,7 @@ #include "cl_viv_vx_ext.h" +_viv_uniform float 
alpha; + float4 eltwise_unary_sin(float4 x) { return native_sin(x); @@ -24,7 +26,7 @@ float4 eltwise_unary_log(float4 x) float4 eltwise_unary_elu(float4 val) { float4 x = val * logE; - x = exp2(x) - 1; + x = exp2(x) * alpha - alpha; return val < 0 ? x : val; } @@ -78,7 +80,8 @@ _viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4; __kernel void func_name##_##src_type_name##to##dst_type_name##_2D( \ __read_only image2d_array_t input, \ __write_only image2d_array_t output, \ - int type \ + int type, \ + float _alpha \ ) \ { \ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ @@ -194,7 +197,8 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8; __kernel void func_name##_BF16toBF16_2D( \ __read_only image2d_array_t input, \ __write_only image2d_array_t output, \ - int type \ + int type, \ + float _alpha \ ) \ { \ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx index 832c948..f452849 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx @@ -1,5 +1,7 @@ #include "cl_viv_vx_ext.h" +_viv_uniform float alpha; + float4 eltwise_unary_sin(float4 x) { return native_sin(x); @@ -24,7 +26,7 @@ float4 eltwise_unary_log(float4 x) float4 eltwise_unary_elu(float4 val) { float4 x = val * logE; - x = exp2(x) - 1; + x = exp2(x) * alpha - alpha; return val < 0 ? x : val; } @@ -78,7 +80,8 @@ _viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4; __kernel void func_name##_##src_type_name##to##dst_type_name( \ __read_only image2d_array_t input, \ __write_only image2d_array_t output, \ - int type \ + int type, \ + float _alpha \ ) \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ @@ -192,7 +195,8 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8; __kernel void func_name##_BF16toBF16( \ __read_only image2d_array_t input, \ __write_only image2d_array_t output, \ - int type \ + int type, \ + float _alpha \ ) \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_BF16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_BF16.vx new file mode 100644 index 0000000..cfbae00 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_BF16.vx @@ -0,0 +1,148 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float scale_x; +_viv_uniform int out_height; +_viv_uniform VXC_512Bits uniConvertI32toI16_2x8; +_viv_uniform VXC_512Bits uniGetMaskShift_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_odd_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_even_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform float half_pixel_value; + +__kernel void resize_1d_bilinear_BF16toBF16_DOWN + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value; + float4 left_x_f = floor(in_x); + float4 x_lerp = in_x - left_x_f; + int4 left_x_idx = convert_int4(left_x_f); + + vxc_short8 top; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + int4 coord_in = (int4)(left_x_idx.x, coord_out.y, 
coord_out.z, 0); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + do + { + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.y; + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.z; + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.w; + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + + vxc_ushort8 src; + float4 left4; + float4 right4; + float4 dst4; + + VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); + _viv_asm(COPY, right4, src, 16); + VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); + _viv_asm(COPY, left4, src, 16); + right4 -= left4; + dst4 = right4 * x_lerp + left4; + vxc_ushort8 tmp, dst; + _viv_asm(COPY, tmp, dst4, 16); + dst.s0123 = tmp.s1357; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_in.y++; + coord_out.y ++; + } while (coord_out.y < out_height); +} + +__kernel void resize_1d_bilinear_BF16toBF16_UP + ( + image2d_array_t input, + image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value; + float4 left_x_f = floor(in_x); + float4 x_lerp = in_x - left_x_f; + int4 left_x_idx = convert_int4(left_x_f); + float4 right_x_f = ceil(in_x); + int4 right_x_idx = convert_int4(right_x_f); + + vxc_ushort8 src0, src1, dst0; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0); + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + vxc_ushort8 bitextract_p0; + vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; + VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8); + vxc_ushort8 constData = 16; + VXC_DP2x8(maskShift, bitextract_p0, constData, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + do + { + VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + coord_in.y ++; + VXC_OP4(img_load_3d, src0, input, 
coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + vxc_ushort8 dst_tmp; + float4 left4; + float4 right4; + + VXC_DP2x8(dst_tmp, dst0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, left4, dst_tmp, 16); + VXC_DP2x8(dst_tmp, dst0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, right4, dst_tmp, 16); + right4 -= left4; + float4 dst4 = right4 * x_lerp + left4; + + vxc_ushort8 tmp, dst; + _viv_asm(COPY, tmp, dst4, 16); + dst.s0123 = tmp.s1357; + + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + } while (coord_out.y < out_height); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_DOWN_NX.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_DOWN_NX.vx new file mode 100644 index 0000000..e8cc06c --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_DOWN_NX.vx @@ -0,0 +1,136 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniResizeNxDown_2x8; +_viv_uniform int out_height; + +#define RESIZE_1D_NX_DOWN_8BIT_SAME_PROCESS(read_type, data_type) \ + read_type read_data, save_data; \ + data_type in0, result; \ + \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + \ + while (coord_out.y < out_height) \ + { \ + VXC_OP4(img_load_3d, read_data, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, in0, read_data, 16); \ + VXC_DP2x8(result, in0, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResizeNxDown_2x8); \ + _viv_asm(COPY, save_data, result, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, save_data, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + coord_in.y++; \ + coord_out.y++; \ + } \ + +#define RESIZE_1D_2X_DOWN_8BIT_HALF_SAME(name0, name1, read_type, data_type) \ +__kernel void resize_1d_bilinear_##name0##to##name1##_DOWN_2X_HALF_SAME \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int scale_type \ + ) \ +{ \ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); \ + int4 coord_in = (int4)(get_global_id(0), 0, get_global_id(1), 0); \ + coord_in.x = coord_out.x << 1; \ + RESIZE_1D_NX_DOWN_8BIT_SAME_PROCESS(read_type, data_type) \ +} + +RESIZE_1D_2X_DOWN_8BIT_HALF_SAME(U8, U8, vxc_uchar16, vxc_uchar16) +RESIZE_1D_2X_DOWN_8BIT_HALF_SAME(I8, I8, vxc_char16, vxc_char16) + + + +#define RESIZE_1D_2X_DOWN_8BIT_SAME(name0, name1, read_type, data_type) \ +__kernel void resize_1d_bilinear_##name0##to##name1##_DOWN_2X_SAME \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int scale_type \ + ) \ +{ \ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); \ + int4 coord_in = (int4)(get_global_id(0), 0, get_global_id(1), 0); \ + coord_in.x = coord_out.x << 1; \ + RESIZE_1D_NX_DOWN_8BIT_SAME_PROCESS(read_type, data_type) \ +} + +RESIZE_1D_2X_DOWN_8BIT_SAME(U8, U8, vxc_uchar16, 
vxc_uchar16) +RESIZE_1D_2X_DOWN_8BIT_SAME(I8, I8, vxc_char16, vxc_char16) + + +#define RESIZE_1D_NX_DOWN_16BIT_SAME_PROCESS(read_type, data_type) \ + read_type read_data, read_data1, save_data; \ + data_type in0, in1, result; \ + \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + \ + while (coord_out.y < out_height) \ + { \ + VXC_OP4(img_load_3d, read_data, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, in0, read_data, 16); \ + VXC_OP4(img_load_3d, read_data1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, in1, read_data1, 16); \ + VXC_DP2x8(result, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResizeNxDown_2x8); \ + _viv_asm(COPY, save_data, result, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, save_data, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + coord_in.y++; \ + coord_out.y++; \ + } \ + +#define RESIZE_1D_2X_DOWN_16BIT_HALF_SAME(name0, name1, read_type, data_type) \ +__kernel void resize_1d_bilinear_##name0##to##name1##_DOWN_2X_HALF_SAME \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int scale_type \ + ) \ +{ \ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); \ + int4 coord_in = (int4)(get_global_id(0), 0, get_global_id(1), 0); \ + coord_in.x = coord_out.x << 1; \ + RESIZE_1D_NX_DOWN_16BIT_SAME_PROCESS(read_type, data_type) \ +} + +RESIZE_1D_2X_DOWN_16BIT_HALF_SAME(I16, I16, vxc_short8, vxc_short8) +RESIZE_1D_2X_DOWN_16BIT_HALF_SAME(F16, F16, vxc_short8, vxc_half8) + + + +#define RESIZE_1D_2X_DOWN_16BIT_SAME(name0, name1, read_type, data_type) \ +__kernel void resize_1d_bilinear_##name0##to##name1##_DOWN_2X_SAME \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int scale_type \ + ) \ +{ \ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); \ + int4 coord_in = (int4)(get_global_id(0), 0, get_global_id(1), 0); \ + coord_in.x = coord_out.x << 1; \ + RESIZE_1D_NX_DOWN_16BIT_SAME_PROCESS(read_type, data_type) \ +} + +RESIZE_1D_2X_DOWN_16BIT_SAME(I16, I16, vxc_short8, vxc_short8) +RESIZE_1D_2X_DOWN_16BIT_SAME(F16, F16, vxc_short8, vxc_half8) + + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_F16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_F16.vx new file mode 100644 index 0000000..3487679 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_F16.vx @@ -0,0 +1,216 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniExtact8Bit_2x8; +_viv_uniform VXC_512Bits uniFp16toFp32_4x4; +_viv_uniform VXC_512Bits uniRightSubLeft_4x4; +_viv_uniform VXC_512Bits uniExtactHalf8_2x8; +_viv_uniform float scale_x; +_viv_uniform int out_height; +_viv_uniform float uint8Scale; +_viv_uniform float output_ZP; +_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4; +_viv_uniform VXC_512Bits uniConvertI32toI16_2x8; +_viv_uniform VXC_512Bits uniGetMaskShift_2x8; +_viv_uniform float half_pixel_value; +_viv_uniform VXC_512Bits uniConvertFp2FP32_left_4x4; +_viv_uniform VXC_512Bits uniConvertFp2FP32_right_4x4; + +__kernel 
void resize_1d_bilinear_F16toF16_DOWN + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value; + float4 left_x_f = floor(in_x); + float4 x_lerp = in_x - left_x_f; + int4 left_x_idx = convert_int4(left_x_f); + float4 left4; + float4 right4; + + vxc_ushort8 src, result; + vxc_half8 src_half, dst; + int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0); + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + do + { + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.y; + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.z; + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.w; + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, src_half, src, 16); + + VXC_DP4x4(left4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_left_4x4); + VXC_DP4x4(right4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_right_4x4); + right4 -= left4; + float4 dst4 = right4 * x_lerp + left4; + + half4 tmp; + _viv_asm(CONV, tmp, dst4); + VXC_DP2x8(dst, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); + _viv_asm(COPY, result, dst, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_in.y++; + coord_out.y ++; + } while (coord_out.y < out_height); + +} + +__kernel void resize_1d_bilinear_F16toU8_DOWN + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value; + float4 left_x_f = floor(in_x); + float4 x_lerp = in_x - left_x_f; + int4 left_x_idx = convert_int4(left_x_f); + float4 left4; + float4 right4; + + vxc_ushort8 src; + vxc_uchar8 result; + vxc_half8 src_half, dst; + int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0); + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + do + { + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.y; + VXC_OP4(img_load_3d, src, 
input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.z; + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.w; + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, src_half, src, 16); + + VXC_DP4x4(left4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_left_4x4); + VXC_DP4x4(right4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_right_4x4); + right4 -= left4; + float4 dst4 = right4 * x_lerp + left4; + + dst4 = dst4 * uint8Scale + output_ZP; + int4 dst = convert_int4_rte(dst4); + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); + + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_in.y++; + coord_out.y ++; + } while (coord_out.y < out_height); +} + +__kernel void resize_1d_bilinear_F16toF16_UP + ( + image2d_array_t input, + image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value; + float4 left_x_f = floor(in_x); + float4 x_lerp = in_x - left_x_f; + int4 left_x_idx = convert_int4(left_x_f); + float4 right_x_f = ceil(in_x); + int4 right_x_idx = convert_int4(right_x_f); + + vxc_ushort8 src0, src1, dst0; + vxc_half8 top; + int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0); + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + vxc_ushort8 bitextract_p0; + vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; + VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8); + vxc_ushort8 constData = 16; + VXC_DP2x8(maskShift, bitextract_p0, constData, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + do + { + VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, top, dst0, 16); + + coord_in.y ++; + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + float4 left4; + float4 right4; + + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4); + float4 dst4 = right4 * x_lerp + left4; + + half4 tmp; + _viv_asm(CONV, tmp, dst4); + VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); + _viv_asm(COPY, dst0, top, 16); + + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + } while (coord_out.y < out_height); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_I16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_I16.vx new file mode 100644 index 0000000..956dc62 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_I16.vx @@ -0,0 +1,147 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniExtact8Bit_2x8; +_viv_uniform float scale_x; +_viv_uniform int out_height; +_viv_uniform VXC_512Bits uniConvertI32toI16_2x8; +_viv_uniform VXC_512Bits uniGetMaskShift_2x8; +_viv_uniform VXC_512Bits uniConvertDFP2FP32_part1_4x4; +_viv_uniform VXC_512Bits uniConvertDFP2FP32_4x4; +_viv_uniform float dfpScale; +_viv_uniform float half_pixel_value; +_viv_uniform VXC_512Bits uniConvertDFP2FP32_left_4x4; +_viv_uniform VXC_512Bits uniConvertDFP2FP32_right_4x4; + +__kernel void resize_1d_bilinear_I16toI16_UP + ( + image2d_array_t input, + image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value; + + float4 left_x_f = floor(in_x); + float4 x_lerp = in_x - left_x_f; + int4 left_x_idx = convert_int4(left_x_f); + float4 right_x_f = ceil(in_x); + int4 right_x_idx = convert_int4(right_x_f); + + vxc_ushort8 src0, src1, dst0; + + vxc_short8 top; + int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0); + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + vxc_ushort8 bitextract_p0; + vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; + VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8); + vxc_ushort8 constData = 16; + VXC_DP2x8(maskShift, bitextract_p0, constData, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + do + { + VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, top, dst0, 16); + + float4 left4; + float4 right4; + + coord_in.y ++; + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4); + float4 dst4 = right4 * x_lerp + left4; + dst4 = dst4 * dfpScale; + int4 dst = convert_int4_rte(dst4); + VXC_DP2x8(top, dst, dst, 
VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); + + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_out.y ++; + } while (coord_out.y < out_height); +} + +__kernel void resize_1d_bilinear_I16toI16_DOWN + ( + image2d_array_t input, + image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value; + float4 left_x_f = floor(in_x); + float4 x_lerp = in_x - left_x_f; + int4 left_x_idx = convert_int4(left_x_f); + vxc_short8 src; + int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0); + float4 left4; + float4 right4; + vxc_short8 result; + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + + do + { + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.y; + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.z; + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.w; + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_DP4x4(left4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_left_4x4); + VXC_DP4x4(right4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_right_4x4); + right4 -= left4; + float4 dst4 = right4 * x_lerp + left4; + dst4 = dst4 * dfpScale; + + int4 dst = convert_int4_rte(dst4); + + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_in.y++; + coord_out.y ++; + } while (coord_out.y < out_height); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_I8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_I8.vx new file mode 100644 index 0000000..e25071c --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_I8.vx @@ -0,0 +1,148 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniExtact8Bit_2x8; +_viv_uniform float scale_x; +_viv_uniform int out_height; +_viv_uniform VXC_512Bits uniConvertI32toI16_2x8; +_viv_uniform VXC_512Bits uniGetMaskShift_2x8; +_viv_uniform VXC_512Bits uniConvertDFP2FP32_part1_4x4; +_viv_uniform VXC_512Bits uniConvertDFP2FP32_4x4; +_viv_uniform float dfpScale; +_viv_uniform float half_pixel_value; +_viv_uniform VXC_512Bits uniConvertDFP2FP32_left_4x4; +_viv_uniform VXC_512Bits uniConvertDFP2FP32_right_4x4; + +__kernel void resize_1d_bilinear_I8toI8_UP + ( + image2d_array_t input, + image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 
2, 3); + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value; + + float4 left_x_f = floor(in_x); + float4 x_lerp = in_x - left_x_f; + int4 left_x_idx = convert_int4(left_x_f); + float4 right_x_f = ceil(in_x); + int4 right_x_idx = convert_int4(right_x_f); + + vxc_uchar16 src0, dst0; + + vxc_char16 top; + + int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0); + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + vxc_ushort8 bitextract_p0; + vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; + VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8); + vxc_ushort8 constData = 8; + VXC_DP2x8(maskShift, bitextract_p0, constData, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + do + { + VXC_BitExtract(dst0, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, top, dst0, 16); + + coord_in.y++; + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + float4 left4; + float4 right4; + + VXC_DP4x4(left4, top, top, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); + VXC_DP4x4(right4, top, top, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4); + + float4 dst4 = right4 * x_lerp + left4; + + dst4 = dst4 * dfpScale; + int4 dst = convert_int4_rte(dst4); + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + coord_out.y ++; + } while (coord_out.y < out_height); +} + +__kernel void resize_1d_bilinear_I8toI8_DOWN + ( + image2d_array_t input, + image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value; + float4 left_x_f = floor(in_x); + float4 x_lerp = in_x - left_x_f; + int4 left_x_idx = convert_int4(left_x_f); + vxc_char16 src; + int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0); + float4 left4; + float4 right4; + vxc_char16 result; + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + + do + { + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.y; + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.z; + VXC_OP4(img_load_3d, src, input, 
coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.w; + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_DP4x4(left4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_left_4x4); + VXC_DP4x4(right4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_right_4x4); + right4 -= left4; + float4 dst4 = right4 * x_lerp + left4; + dst4 = dst4 * dfpScale; + + int4 dst = convert_int4_rte(dst4); + + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_in.y++; + coord_out.y ++; + } while (coord_out.y < out_height); + +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_U8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_U8.vx new file mode 100644 index 0000000..b25fba9 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_U8.vx @@ -0,0 +1,212 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_4x4; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_left_4x4; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_right_4x4; +_viv_uniform VXC_512Bits uniExtact8Bit_2x8; +_viv_uniform float scale_x; +_viv_uniform int out_height; +_viv_uniform int input_ZP; +_viv_uniform float uint8Scale; +_viv_uniform float output_ZP; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4; +_viv_uniform VXC_512Bits uniConvertI32toI16_2x8; +_viv_uniform VXC_512Bits uniGetMaskShift_2x8; +_viv_uniform float half_pixel_value; + +__kernel void resize_1d_bilinear_U8toF16_DOWN + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value; + float4 left_x_f = floor(in_x); + float4 x_lerp = in_x - left_x_f; + int4 left_x_idx = convert_int4(left_x_f); + vxc_uchar16 src; + int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0); + float4 left4; + float4 right4; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + unsigned char inputZP; + _viv_asm(COPY, inputZP, input_ZP, 4); + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + + do + { + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.y; + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.z; + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.w; + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_DP4x4(left4, src, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, src, inputZP, VXC_MODIFIER(0, 
3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_right_4x4); + right4 -= left4; + float4 dst4 = right4 * x_lerp + left4; + dst4 *= uint8Scale; + half4 dst; + _viv_asm(CONV, dst, dst4); + vxc_short8 dst_short; + _viv_asm(COPY, dst_short, dst, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst_short, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_in.y++; + coord_out.y ++; + } while (coord_out.y < out_height); +} + +__kernel void resize_1d_bilinear_U8toU8_UP + ( + image2d_array_t input, + image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value; + + float4 left_x_f = floor(in_x); + float4 x_lerp = in_x - left_x_f; + int4 left_x_idx = convert_int4(left_x_f); + float4 right_x_f = ceil(in_x); + int4 right_x_idx = convert_int4(right_x_f); + + + vxc_uchar16 src0, src1; + + vxc_uchar16 top; + + int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0); + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + vxc_ushort8 bitextract_p0; + vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; + VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8); + vxc_ushort8 constData = 8; + VXC_DP2x8(maskShift, bitextract_p0, constData, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + do + { + VXC_BitExtract(top, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.y++; + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + float4 left4; + float4 right4; + + unsigned char inputZP; + _viv_asm(COPY, inputZP, input_ZP, 4); + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4); + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4); + + float4 dst4 = right4 * x_lerp + left4; + dst4 = dst4 * uint8Scale + output_ZP; + int4 dst = convert_int4_rte(dst4); + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + coord_out.y ++; + } while (coord_out.y < out_height); +} + +__kernel void resize_1d_bilinear_U8toU8_DOWN + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value; + float4 left_x_f = floor(in_x); + float4 x_lerp = in_x - left_x_f; + int4 left_x_idx = convert_int4(left_x_f); + vxc_uchar16 src; + int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0); + 
float4 left4; + float4 right4; + vxc_uchar16 result; + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + unsigned char inputZP; + _viv_asm(COPY, inputZP, input_ZP, 4); + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + + do + { + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.y; + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.z; + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.w; + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_DP4x4(left4, src, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, src, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_right_4x4); + right4 -= left4; + float4 dst4 = right4 * x_lerp + left4; + dst4 = dst4 * uint8Scale + output_ZP; + int4 dst = convert_int4_rte(dst4); + + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); + + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_in.y++; + coord_out.y ++; + } while (coord_out.y < out_height); + +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_U8_opt.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_U8_opt.vx new file mode 100644 index 0000000..ab7e74b --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_U8_opt.vx @@ -0,0 +1,78 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float scale_x; +_viv_uniform int out_height; +_viv_uniform VXC_512Bits uniConvertI32toI16_2x8; +_viv_uniform VXC_512Bits uniGetMaskShift_2x8; +_viv_uniform VXC_512Bits uniBilinear_4x4; +_viv_uniform float half_pixel_value; + +__kernel void resize_1d_bilinear_U8toU8_UP_opt + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers, + __read_only image2d_array_t scale + ) +{ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(2), 0); + + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value; + + float4 left_x_f = floor(in_x); + int4 left_x_idx = convert_int4(left_x_f); + int4 right_x_idx = left_x_idx + 1; + + vxc_uchar16 src0; + + vxc_uchar16 src_mask; + + int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0); + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + vxc_ushort8 bitextract_p0; + vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; + VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8); + vxc_ushort8 constData = 8; + VXC_DP2x8(maskShift, 
bitextract_p0, constData, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8); + + vxc_ushort8 lerp_0; + vxc_half8 lerp; + + int2 coord = (int2)(coord_out.x * 2, 0); + VXC_ReadImage(lerp_0, scale, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, lerp, lerp_0, 16); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + do + { + VXC_BitExtract(src_mask, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + coord_in.y++; + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + vxc_uchar16 dst; + VXC_DP4x4(dst, src_mask, lerp, + VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4x4); + + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + coord_out.y ++; + } while (coord_out.y < out_height); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_UP_NX.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_UP_NX.vx new file mode 100644 index 0000000..3ddd305 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_UP_NX.vx @@ -0,0 +1,155 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniResizeNxUp_2x8; +_viv_uniform int out_height; + +#define RESIZE_1D_NX_SAME_PROCESS(read_type, data_type) \ + read_type read_data, save_data; \ + data_type in0, result; \ + \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + \ + while (coord_out.y < out_height) \ + { \ + VXC_OP4(img_load_3d, read_data, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, in0, read_data, 16); \ + VXC_DP2x8(result, in0, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResizeNxUp_2x8); \ + _viv_asm(COPY, save_data, result, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, save_data, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + coord_in.y++; \ + coord_out.y++; \ + } \ + +#define RESIZE_1D_2X_HALF_SAME(name0, name1, read_type, data_type) \ +__kernel void resize_1d_bilinear_##name0##to##name1##_UP_2X_HALF_SAME \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int scale_type \ + ) \ +{ \ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); \ + int4 coord_in = (int4)(get_global_id(0), 0, get_global_id(1), 0); \ + coord_in.x = (coord_out.x * 2 - 1) >> 2; \ + coord_in.x = coord_out.x == 0 ? 
-1 : coord_in.x; \ + RESIZE_1D_NX_SAME_PROCESS(read_type, data_type) \ +} + +RESIZE_1D_2X_HALF_SAME(U8, U8, vxc_uchar16, vxc_uchar16) +RESIZE_1D_2X_HALF_SAME(I8, I8, vxc_char16, vxc_char16) +RESIZE_1D_2X_HALF_SAME(I16, I16, vxc_short8, vxc_short8) +RESIZE_1D_2X_HALF_SAME(F16, F16, vxc_short8, vxc_half8) + + +#define RESIZE_1D_2X_SAME(name0, name1, read_type, data_type) \ +__kernel void resize_1d_bilinear_##name0##to##name1##_UP_2X_SAME \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int scale_type \ + ) \ +{ \ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); \ + int4 coord_in = (int4)(get_global_id(0), 0, get_global_id(1), 0); \ + coord_in.x = coord_out.x >> 1; \ + RESIZE_1D_NX_SAME_PROCESS(read_type, data_type) \ +} + +RESIZE_1D_2X_SAME(U8, U8, vxc_uchar16, vxc_uchar16) +RESIZE_1D_2X_SAME(I8, I8, vxc_char16, vxc_char16) +RESIZE_1D_2X_SAME(I16, I16, vxc_short8, vxc_short8) +RESIZE_1D_2X_SAME(F16, F16, vxc_short8, vxc_half8) + + +#define RESIZE_1D_4X_HALF_SAME(name0, name1, read_type, data_type) \ +__kernel void resize_1d_bilinear_##name0##to##name1##_UP_4X_HALF_SAME \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int scale_type \ + ) \ +{ \ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); \ + int4 coord_in = (int4)(get_global_id(0), 0, get_global_id(1), 0); \ + coord_in.x = (coord_out.x * 2 - 3) >> 3; \ + coord_in.x = coord_out.x == 0 ? -1 : coord_in.x; \ + RESIZE_1D_NX_SAME_PROCESS(read_type, data_type) \ +} + +RESIZE_1D_4X_HALF_SAME(U8, U8, vxc_uchar16, vxc_uchar16) +RESIZE_1D_4X_HALF_SAME(I8, I8, vxc_char16, vxc_char16) +RESIZE_1D_4X_HALF_SAME(I16, I16, vxc_short8, vxc_short8) +RESIZE_1D_4X_HALF_SAME(F16, F16, vxc_short8, vxc_half8) + + +#define RESIZE_1D_4X_SAME(name0, name1, read_type, data_type) \ +__kernel void resize_1d_bilinear_##name0##to##name1##_UP_4X_SAME \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int scale_type \ + ) \ +{ \ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); \ + int4 coord_in = (int4)(get_global_id(0), 0, get_global_id(1), 0); \ + coord_in.x = coord_out.x >> 2; \ + RESIZE_1D_NX_SAME_PROCESS(read_type, data_type) \ +} + +RESIZE_1D_4X_SAME(U8, U8, vxc_uchar16, vxc_uchar16) +RESIZE_1D_4X_SAME(I8, I8, vxc_char16, vxc_char16) +RESIZE_1D_4X_SAME(I16, I16, vxc_short8, vxc_short8) +RESIZE_1D_4X_SAME(F16, F16, vxc_short8, vxc_half8) + + +#define RESIZE_1D_8X_HALF_SAME(name0, name1, read_type, data_type) \ +__kernel void resize_1d_bilinear_##name0##to##name1##_UP_8X_HALF_SAME \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int scale_type \ + ) \ +{ \ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); \ + int4 coord_in = (int4)(get_global_id(0), 0, get_global_id(1), 0); \ + coord_in.x = (coord_out.x * 2 - 7) >> 4; \ + coord_in.x = coord_out.x == 0 ? 
-1 : coord_in.x; \ + RESIZE_1D_NX_SAME_PROCESS(read_type, data_type) \ +} + +RESIZE_1D_8X_HALF_SAME(U8, U8, vxc_uchar16, vxc_uchar16) +RESIZE_1D_8X_HALF_SAME(I8, I8, vxc_char16, vxc_char16) +RESIZE_1D_8X_HALF_SAME(I16, I16, vxc_short8, vxc_short8) +RESIZE_1D_8X_HALF_SAME(F16, F16, vxc_short8, vxc_half8) + + +#define RESIZE_1D_8X_SAME(name0, name1, read_type, data_type) \ +__kernel void resize_1d_bilinear_##name0##to##name1##_UP_8X_SAME \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int scale_type \ + ) \ +{ \ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); \ + int4 coord_in = (int4)(get_global_id(0), 0, get_global_id(1), 0); \ + coord_in.x = coord_out.x >> 3; \ + RESIZE_1D_NX_SAME_PROCESS(read_type, data_type) \ +} + +RESIZE_1D_8X_SAME(U8, U8, vxc_uchar16, vxc_uchar16) +RESIZE_1D_8X_SAME(I8, I8, vxc_char16, vxc_char16) +RESIZE_1D_8X_SAME(I16, I16, vxc_short8, vxc_short8) +RESIZE_1D_8X_SAME(F16, F16, vxc_short8, vxc_half8) + + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_nearest.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_nearest.vx new file mode 100644 index 0000000..75d0c47 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_nearest.vx @@ -0,0 +1,337 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniMultiplyAndPostShift_2x8; +_viv_uniform float scale_x; +_viv_uniform float half_pixel_value; +_viv_uniform float round_value; +_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp + +#define NEAREST_INDEX_PROCESS() \ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); \ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x + round_value; \ + int4 in_x_idx = convert_int4(in_x); \ + + +__kernel void resize_1d_nearest_F16toF16 + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + NEAREST_INDEX_PROCESS() + + vxc_short8 src; + int4 coord_in = (int4)(in_x_idx.x, coord_out.y, coord_out.z, 0); + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = in_x_idx.y; + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = in_x_idx.z; + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = in_x_idx.w; + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform VXC_512Bits uniGetExtractData_2x8; +__kernel void resize_1d_nearest_F16toF16_op + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + NEAREST_INDEX_PROCESS() + + vxc_ushort8 src0, src1, dst; + int4 coord_in = (int4)(in_x_idx.x, coord_out.y, coord_out.z, 0); + + 
int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16); + vxc_ushort8 input_idx; + _viv_asm(COPY, input_idx, in_x_idx, 16); + VXC_DP2x8(mask, input_idx, input_idx, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetExtractData_2x8); + VXC_BitExtract(dst, src0, src1, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform VXC_512Bits uniConvertI8toI8_2x8; +__kernel void resize_1d_nearest_I8toI8 + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + NEAREST_INDEX_PROCESS() + + vxc_char16 src; + int4 coord_in = (int4)(in_x_idx.x, coord_out.y, coord_out.z, 0); + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = in_x_idx.y; + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = in_x_idx.z; + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = in_x_idx.w; + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(src, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void resize_1d_nearest_I8toI8_op + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + NEAREST_INDEX_PROCESS() + + vxc_uchar16 src0, dst0; + vxc_char16 dst; + int4 coord_in = (int4)(in_x_idx.x, coord_out.y, coord_out.z, 0); + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8); + vxc_ushort8 input_idx; + _viv_asm(COPY, input_idx, in_x_idx, 16); + 
VXC_DP2x8(mask, input_idx, input_idx, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetExtractData_2x8); + VXC_BitExtract(dst0, src0, src0, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, dst, dst0, 8); + VXC_DP2x8(dst, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void resize_1d_nearest_U8toU8 + ( + image2d_array_t input, + image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + NEAREST_INDEX_PROCESS() + + vxc_uchar16 src; + int4 coord_in = (int4)(in_x_idx.x, coord_out.y, coord_out.z, 0); + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = in_x_idx.y; + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = in_x_idx.z; + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = in_x_idx.w; + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + vxc_ushort8 multiplier; + _viv_asm(COPY, multiplier, multAndoutZP, 16); + VXC_DP2x8(src, src, multiplier, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8); + + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void resize_1d_nearest_U8toU8_op + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + NEAREST_INDEX_PROCESS() + + vxc_uchar16 src0, dst; + int4 coord_in = (int4)(in_x_idx.x, coord_out.y, coord_out.z, 0); + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8); + vxc_ushort8 input_idx; + _viv_asm(COPY, input_idx, in_x_idx, 16); + VXC_DP2x8(mask, input_idx, input_idx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetExtractData_2x8); + VXC_BitExtract(dst, src0, src0, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + vxc_ushort8 multiplier; + _viv_asm(COPY, multiplier, multAndoutZP, 16); + VXC_DP2x8(dst, dst, multiplier, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8); + + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void resize_1d_nearest_I16toI16 + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int 
half_pixel_centers + ) +{ + NEAREST_INDEX_PROCESS() + + vxc_short8 src; + int4 coord_in = (int4)(in_x_idx.x, coord_out.y, coord_out.z, 0); + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = in_x_idx.y; + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = in_x_idx.z; + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = in_x_idx.w; + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_DP2x8(src, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); + + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void resize_1d_nearest_I16toI16_op + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + NEAREST_INDEX_PROCESS() + + vxc_ushort8 src0, src1, dst0; + vxc_short8 dst; + int4 coord_in = (int4)(in_x_idx.x, coord_out.y, coord_out.z, 0); + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16); + vxc_ushort8 input_idx; + _viv_asm(COPY, input_idx, in_x_idx, 16); + VXC_DP2x8(mask, input_idx, input_idx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetExtractData_2x8); + VXC_BitExtract(dst0, src0, src1, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, dst, dst0, 8); + VXC_DP2x8(dst, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); + + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c index 1d65b5e..366041c 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c @@ -3177,6 +3177,8 @@ __kernel void detect_post_box_U8_U8toF32(\n\ static const char eltwise_unary_2d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ +_viv_uniform float alpha;\n\ +\n\ float4 eltwise_unary_sin(float4 x)\n\ {\n\ return native_sin(x);\n\ @@ -3201,7 +3203,7 @@ float4 eltwise_unary_log(float4 x)\n\ float4 eltwise_unary_elu(float4 val)\n\ {\n\ float4 x = val * logE;\n\ - x = exp2(x) - 1;\n\ + x = exp2(x) 
* alpha - alpha;\n\ \n\ return val < 0 ? x : val;\n\ }\n\ @@ -3255,7 +3257,8 @@ _viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4;\n\ __kernel void func_name##_##src_type_name##to##dst_type_name##_2D( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ - int type \\\n\ + int type, \\\n\ + float _alpha \\\n\ ) \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ @@ -3371,7 +3374,8 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ __kernel void func_name##_BF16toBF16_2D( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ - int type \\\n\ + int type, \\\n\ + float _alpha \\\n\ ) \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ @@ -3412,6 +3416,8 @@ ELTSISE_UNARY_BF16_2D(hard_sigmoid)\n\ static const char eltwise_unary_3d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ +_viv_uniform float alpha;\n\ +\n\ float4 eltwise_unary_sin(float4 x)\n\ {\n\ return native_sin(x);\n\ @@ -3436,7 +3442,7 @@ float4 eltwise_unary_log(float4 x)\n\ float4 eltwise_unary_elu(float4 val)\n\ {\n\ float4 x = val * logE;\n\ - x = exp2(x) - 1;\n\ + x = exp2(x) * alpha - alpha;\n\ \n\ return val < 0 ? x : val;\n\ }\n\ @@ -3490,7 +3496,8 @@ _viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4;\n\ __kernel void func_name##_##src_type_name##to##dst_type_name( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ - int type \\\n\ + int type, \\\n\ + float _alpha \\\n\ ) \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ @@ -3604,7 +3611,8 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ __kernel void func_name##_BF16toBF16( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ - int type \\\n\ + int type, \\\n\ + float _alpha \\\n\ ) \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ @@ -28076,6 +28084,1601 @@ TENSOR_KERAS_RELU(U8, U8, _2D, Image, U8toF32, F32toU8, vxc_uchar TENSOR_KERAS_RELU(U8, F16, _2D, Image, U8toF32, F32toF16, vxc_uchar8, vxc_ushort8)\n\ "; /* end of relu_keras_vx*/ +static const char resize_1d_bilinear_BF16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float scale_x;\n\ +_viv_uniform int out_height;\n\ +_viv_uniform VXC_512Bits uniConvertI32toI16_2x8;\n\ +_viv_uniform VXC_512Bits uniGetMaskShift_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_odd_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_even_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform float half_pixel_value;\n\ +\n\ +__kernel void resize_1d_bilinear_BF16toBF16_DOWN\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value;\n\ + float4 left_x_f = floor(in_x);\n\ + float4 x_lerp = in_x - left_x_f;\n\ + int4 left_x_idx = convert_int4(left_x_f);\n\ +\n\ + vxc_short8 top;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0);\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * 
input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + do\n\ + {\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.y;\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.z;\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.w;\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_ushort8 src;\n\ + float4 left4;\n\ + float4 right4;\n\ + float4 dst4;\n\ +\n\ + VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8);\n\ + _viv_asm(COPY, right4, src, 16);\n\ + VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8);\n\ + _viv_asm(COPY, left4, src, 16);\n\ + right4 -= left4;\n\ + dst4 = right4 * x_lerp + left4;\n\ + vxc_ushort8 tmp, dst;\n\ + _viv_asm(COPY, tmp, dst4, 16);\n\ + dst.s0123 = tmp.s1357;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_in.y++;\n\ + coord_out.y ++;\n\ + } while (coord_out.y < out_height);\n\ +}\n\ +\n\ +__kernel void resize_1d_bilinear_BF16toBF16_UP\n\ + (\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value;\n\ + float4 left_x_f = floor(in_x);\n\ + float4 x_lerp = in_x - left_x_f;\n\ + int4 left_x_idx = convert_int4(left_x_f);\n\ + float4 right_x_f = ceil(in_x);\n\ + int4 right_x_idx = convert_int4(right_x_f);\n\ +\n\ + vxc_ushort8 src0, src1, dst0;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0);\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_ushort8 bitextract_p0;\n\ + vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};\n\ + VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8);\n\ + vxc_ushort8 constData = 16;\n\ + VXC_DP2x8(maskShift, bitextract_p0, constData, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + do\n\ + {\n\ + 
VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.y ++;\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_ushort8 dst_tmp;\n\ + float4 left4;\n\ + float4 right4;\n\ +\n\ + VXC_DP2x8(dst_tmp, dst0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, left4, dst_tmp, 16);\n\ + VXC_DP2x8(dst_tmp, dst0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, right4, dst_tmp, 16);\n\ + right4 -= left4;\n\ + float4 dst4 = right4 * x_lerp + left4;\n\ +\n\ + vxc_ushort8 tmp, dst;\n\ + _viv_asm(COPY, tmp, dst4, 16);\n\ + dst.s0123 = tmp.s1357;\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + } while (coord_out.y < out_height);\n\ +}\n\ +"; /* end of resize_1d_bilinear_BF16_vx*/ + +static const char resize_1d_bilinear_DOWN_NX_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniResizeNxDown_2x8;\n\ +_viv_uniform int out_height;\n\ +\n\ +#define RESIZE_1D_NX_DOWN_8BIT_SAME_PROCESS(read_type, data_type) \\\n\ + read_type read_data, save_data; \\\n\ + data_type in0, result; \\\n\ + \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + \\\n\ + while (coord_out.y < out_height) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, read_data, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, in0, read_data, 16); \\\n\ + VXC_DP2x8(result, in0, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResizeNxDown_2x8); \\\n\ + _viv_asm(COPY, save_data, result, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, save_data, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_in.y++; \\\n\ + coord_out.y++; \\\n\ + } \\\n\ +\n\ +#define RESIZE_1D_2X_DOWN_8BIT_HALF_SAME(name0, name1, read_type, data_type) \\\n\ +__kernel void resize_1d_bilinear_##name0##to##name1##_DOWN_2X_HALF_SAME \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int scale_type \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); \\\n\ + int4 coord_in = (int4)(get_global_id(0), 0, get_global_id(1), 0); \\\n\ + coord_in.x = coord_out.x << 1; \\\n\ + RESIZE_1D_NX_DOWN_8BIT_SAME_PROCESS(read_type, data_type) \\\n\ +}\n\ +\n\ +RESIZE_1D_2X_DOWN_8BIT_HALF_SAME(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +RESIZE_1D_2X_DOWN_8BIT_HALF_SAME(I8, I8, vxc_char16, vxc_char16)\n\ +\n\ +\n\ +\n\ +#define RESIZE_1D_2X_DOWN_8BIT_SAME(name0, name1, read_type, data_type) \\\n\ +__kernel void resize_1d_bilinear_##name0##to##name1##_DOWN_2X_SAME \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int scale_type \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_out = (int4)(get_global_id(0), 0, 
get_global_id(1), 0); \\\n\ + int4 coord_in = (int4)(get_global_id(0), 0, get_global_id(1), 0); \\\n\ + coord_in.x = coord_out.x << 1; \\\n\ + RESIZE_1D_NX_DOWN_8BIT_SAME_PROCESS(read_type, data_type) \\\n\ +}\n\ +\n\ +RESIZE_1D_2X_DOWN_8BIT_SAME(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +RESIZE_1D_2X_DOWN_8BIT_SAME(I8, I8, vxc_char16, vxc_char16)\n\ +\n\ +\n\ +#define RESIZE_1D_NX_DOWN_16BIT_SAME_PROCESS(read_type, data_type) \\\n\ + read_type read_data, read_data1, save_data; \\\n\ + data_type in0, in1, result; \\\n\ + \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + \\\n\ + while (coord_out.y < out_height) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, read_data, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, in0, read_data, 16); \\\n\ + VXC_OP4(img_load_3d, read_data1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, in1, read_data1, 16); \\\n\ + VXC_DP2x8(result, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResizeNxDown_2x8); \\\n\ + _viv_asm(COPY, save_data, result, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, save_data, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_in.y++; \\\n\ + coord_out.y++; \\\n\ + } \\\n\ +\n\ +#define RESIZE_1D_2X_DOWN_16BIT_HALF_SAME(name0, name1, read_type, data_type) \\\n\ +__kernel void resize_1d_bilinear_##name0##to##name1##_DOWN_2X_HALF_SAME \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int scale_type \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); \\\n\ + int4 coord_in = (int4)(get_global_id(0), 0, get_global_id(1), 0); \\\n\ + coord_in.x = coord_out.x << 1; \\\n\ + RESIZE_1D_NX_DOWN_16BIT_SAME_PROCESS(read_type, data_type) \\\n\ +}\n\ +\n\ +RESIZE_1D_2X_DOWN_16BIT_HALF_SAME(I16, I16, vxc_short8, vxc_short8)\n\ +RESIZE_1D_2X_DOWN_16BIT_HALF_SAME(F16, F16, vxc_short8, vxc_half8)\n\ +\n\ +\n\ +\n\ +#define RESIZE_1D_2X_DOWN_16BIT_SAME(name0, name1, read_type, data_type) \\\n\ +__kernel void resize_1d_bilinear_##name0##to##name1##_DOWN_2X_SAME \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int scale_type \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); \\\n\ + int4 coord_in = (int4)(get_global_id(0), 0, get_global_id(1), 0); \\\n\ + coord_in.x = coord_out.x << 1; \\\n\ + RESIZE_1D_NX_DOWN_16BIT_SAME_PROCESS(read_type, data_type) \\\n\ +}\n\ +\n\ +RESIZE_1D_2X_DOWN_16BIT_SAME(I16, I16, vxc_short8, vxc_short8)\n\ +RESIZE_1D_2X_DOWN_16BIT_SAME(F16, F16, vxc_short8, vxc_half8)\n\ +\n\ +\n\ +"; /* end of resize_1d_bilinear_DOWN_NX_vx*/ + +static const char resize_1d_bilinear_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniRightSubLeft_4x4;\n\ +_viv_uniform VXC_512Bits uniExtactHalf8_2x8;\n\ +_viv_uniform float scale_x;\n\ +_viv_uniform int out_height;\n\ 
+_viv_uniform float uint8Scale;\n\ +_viv_uniform float output_ZP;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertI32toI16_2x8;\n\ +_viv_uniform VXC_512Bits uniGetMaskShift_2x8;\n\ +_viv_uniform float half_pixel_value;\n\ +_viv_uniform VXC_512Bits uniConvertFp2FP32_left_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertFp2FP32_right_4x4;\n\ +\n\ +__kernel void resize_1d_bilinear_F16toF16_DOWN\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value;\n\ + float4 left_x_f = floor(in_x);\n\ + float4 x_lerp = in_x - left_x_f;\n\ + int4 left_x_idx = convert_int4(left_x_f);\n\ + float4 left4;\n\ + float4 right4;\n\ +\n\ + vxc_ushort8 src, result;\n\ + vxc_half8 src_half, dst;\n\ + int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0);\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + do\n\ + {\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.y;\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.z;\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.w;\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, src_half, src, 16);\n\ +\n\ + VXC_DP4x4(left4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_left_4x4);\n\ + VXC_DP4x4(right4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_right_4x4);\n\ + right4 -= left4;\n\ + float4 dst4 = right4 * x_lerp + left4;\n\ +\n\ + half4 tmp;\n\ + _viv_asm(CONV, tmp, dst4);\n\ + VXC_DP2x8(dst, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);\n\ + _viv_asm(COPY, result, dst, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_in.y++;\n\ + coord_out.y ++;\n\ + } while (coord_out.y < out_height);\n\ +\n\ +}\n\ +\n\ +__kernel void resize_1d_bilinear_F16toU8_DOWN\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value;\n\ + float4 left_x_f = floor(in_x);\n\ + float4 x_lerp = in_x - left_x_f;\n\ + int4 left_x_idx = convert_int4(left_x_f);\n\ + float4 left4;\n\ + float4 right4;\n\ +\n\ + vxc_ushort8 src;\n\ + vxc_uchar8 result;\n\ + vxc_half8 
src_half, dst;\n\ + int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0);\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + do\n\ + {\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.y;\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.z;\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.w;\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, src_half, src, 16);\n\ +\n\ + VXC_DP4x4(left4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_left_4x4);\n\ + VXC_DP4x4(right4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_right_4x4);\n\ + right4 -= left4;\n\ + float4 dst4 = right4 * x_lerp + left4;\n\ +\n\ + dst4 = dst4 * uint8Scale + output_ZP;\n\ + int4 dst = convert_int4_rte(dst4);\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_in.y++;\n\ + coord_out.y ++;\n\ + } while (coord_out.y < out_height);\n\ +}\n\ +\n\ +__kernel void resize_1d_bilinear_F16toF16_UP\n\ + (\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value;\n\ + float4 left_x_f = floor(in_x);\n\ + float4 x_lerp = in_x - left_x_f;\n\ + int4 left_x_idx = convert_int4(left_x_f);\n\ + float4 right_x_f = ceil(in_x);\n\ + int4 right_x_idx = convert_int4(right_x_f);\n\ +\n\ + vxc_ushort8 src0, src1, dst0;\n\ + vxc_half8 top;\n\ + int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0);\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_ushort8 bitextract_p0;\n\ + vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};\n\ + VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8);\n\ + vxc_ushort8 constData = 16;\n\ + VXC_DP2x8(maskShift, bitextract_p0, constData, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, 
output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + do\n\ + {\n\ + VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, top, dst0, 16);\n\ +\n\ + coord_in.y ++;\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 left4;\n\ + float4 right4;\n\ +\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4);\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);\n\ + float4 dst4 = right4 * x_lerp + left4;\n\ +\n\ + half4 tmp;\n\ + _viv_asm(CONV, tmp, dst4);\n\ + VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);\n\ + _viv_asm(COPY, dst0, top, 16);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + } while (coord_out.y < out_height);\n\ +}\n\ +"; /* end of resize_1d_bilinear_F16_vx*/ + +static const char resize_1d_bilinear_I16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ +_viv_uniform float scale_x;\n\ +_viv_uniform int out_height;\n\ +_viv_uniform VXC_512Bits uniConvertI32toI16_2x8;\n\ +_viv_uniform VXC_512Bits uniGetMaskShift_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertDFP2FP32_part1_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertDFP2FP32_4x4;\n\ +_viv_uniform float dfpScale;\n\ +_viv_uniform float half_pixel_value;\n\ +_viv_uniform VXC_512Bits uniConvertDFP2FP32_left_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertDFP2FP32_right_4x4;\n\ +\n\ +__kernel void resize_1d_bilinear_I16toI16_UP\n\ + (\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value;\n\ +\n\ + float4 left_x_f = floor(in_x);\n\ + float4 x_lerp = in_x - left_x_f;\n\ + int4 left_x_idx = convert_int4(left_x_f);\n\ + float4 right_x_f = ceil(in_x);\n\ + int4 right_x_idx = convert_int4(right_x_f);\n\ +\n\ + vxc_ushort8 src0, src1, dst0;\n\ +\n\ + vxc_short8 top;\n\ + int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0);\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_ushort8 bitextract_p0;\n\ + vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};\n\ + VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8);\n\ + vxc_ushort8 constData = 16;\n\ + VXC_DP2x8(maskShift, bitextract_p0, constData, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);\n\ 
+\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + do\n\ + {\n\ + VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, top, dst0, 16);\n\ +\n\ + float4 left4;\n\ + float4 right4;\n\ +\n\ + coord_in.y ++;\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4);\n\ + float4 dst4 = right4 * x_lerp + left4;\n\ + dst4 = dst4 * dfpScale;\n\ + int4 dst = convert_int4_rte(dst4);\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y ++;\n\ + } while (coord_out.y < out_height);\n\ +}\n\ +\n\ +__kernel void resize_1d_bilinear_I16toI16_DOWN\n\ + (\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value;\n\ + float4 left_x_f = floor(in_x);\n\ + float4 x_lerp = in_x - left_x_f;\n\ + int4 left_x_idx = convert_int4(left_x_f);\n\ + vxc_short8 src;\n\ + int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0);\n\ + float4 left4;\n\ + float4 right4;\n\ + vxc_short8 result;\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ +\n\ + do\n\ + {\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.y;\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.z;\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.w;\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP4x4(left4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_left_4x4);\n\ + VXC_DP4x4(right4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_right_4x4);\n\ + right4 -= left4;\n\ + float4 dst4 = right4 * x_lerp + left4;\n\ + dst4 = dst4 * dfpScale;\n\ +\n\ + int4 dst = convert_int4_rte(dst4);\n\ +\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, 
result,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_in.y++;\n\ + coord_out.y ++;\n\ + } while (coord_out.y < out_height);\n\ +}\n\ +\n\ +"; /* end of resize_1d_bilinear_I16_vx*/ + +static const char resize_1d_bilinear_I8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ +_viv_uniform float scale_x;\n\ +_viv_uniform int out_height;\n\ +_viv_uniform VXC_512Bits uniConvertI32toI16_2x8;\n\ +_viv_uniform VXC_512Bits uniGetMaskShift_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertDFP2FP32_part1_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertDFP2FP32_4x4;\n\ +_viv_uniform float dfpScale;\n\ +_viv_uniform float half_pixel_value;\n\ +_viv_uniform VXC_512Bits uniConvertDFP2FP32_left_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertDFP2FP32_right_4x4;\n\ +\n\ +__kernel void resize_1d_bilinear_I8toI8_UP\n\ + (\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value;\n\ +\n\ + float4 left_x_f = floor(in_x);\n\ + float4 x_lerp = in_x - left_x_f;\n\ + int4 left_x_idx = convert_int4(left_x_f);\n\ + float4 right_x_f = ceil(in_x);\n\ + int4 right_x_idx = convert_int4(right_x_f);\n\ +\n\ + vxc_uchar16 src0, dst0;\n\ +\n\ + vxc_char16 top;\n\ +\n\ + int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0);\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_ushort8 bitextract_p0;\n\ + vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};\n\ + VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8);\n\ + vxc_ushort8 constData = 8;\n\ + VXC_DP2x8(maskShift, bitextract_p0, constData, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + do\n\ + {\n\ + VXC_BitExtract(dst0, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, top, dst0, 16);\n\ +\n\ + coord_in.y++;\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 left4;\n\ + float4 right4;\n\ +\n\ + VXC_DP4x4(left4, top, top, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ + VXC_DP4x4(right4, top, top, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4);\n\ +\n\ + float4 dst4 = right4 * x_lerp + left4;\n\ +\n\ + dst4 = dst4 * dfpScale;\n\ + int4 dst = convert_int4_rte(dst4);\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + coord_out.y ++;\n\ + } while (coord_out.y < out_height);\n\ +}\n\ +\n\ +__kernel void 
resize_1d_bilinear_I8toI8_DOWN\n\ + (\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value;\n\ + float4 left_x_f = floor(in_x);\n\ + float4 x_lerp = in_x - left_x_f;\n\ + int4 left_x_idx = convert_int4(left_x_f);\n\ + vxc_char16 src;\n\ + int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0);\n\ + float4 left4;\n\ + float4 right4;\n\ + vxc_char16 result;\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ +\n\ + do\n\ + {\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.y;\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.z;\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.w;\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP4x4(left4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_left_4x4);\n\ + VXC_DP4x4(right4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_right_4x4);\n\ + right4 -= left4;\n\ + float4 dst4 = right4 * x_lerp + left4;\n\ + dst4 = dst4 * dfpScale;\n\ +\n\ + int4 dst = convert_int4_rte(dst4);\n\ +\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_in.y++;\n\ + coord_out.y ++;\n\ + } while (coord_out.y < out_height);\n\ +\n\ +}\n\ +"; /* end of resize_1d_bilinear_I8_vx*/ + +static const char resize_1d_bilinear_U8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_left_4x4;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_right_4x4;\n\ +_viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ +_viv_uniform float scale_x;\n\ +_viv_uniform int out_height;\n\ +_viv_uniform int input_ZP;\n\ +_viv_uniform float uint8Scale;\n\ +_viv_uniform float output_ZP;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertI32toI16_2x8;\n\ +_viv_uniform VXC_512Bits uniGetMaskShift_2x8;\n\ +_viv_uniform float half_pixel_value;\n\ +\n\ +__kernel void resize_1d_bilinear_U8toF16_DOWN\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value;\n\ + float4 
left_x_f = floor(in_x);\n\ + float4 x_lerp = in_x - left_x_f;\n\ + int4 left_x_idx = convert_int4(left_x_f);\n\ + vxc_uchar16 src;\n\ + int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0);\n\ + float4 left4;\n\ + float4 right4;\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + unsigned char inputZP;\n\ + _viv_asm(COPY, inputZP, input_ZP, 4);\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ +\n\ + do\n\ + {\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.y;\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.z;\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.w;\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP4x4(left4, src, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, src, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_right_4x4);\n\ + right4 -= left4;\n\ + float4 dst4 = right4 * x_lerp + left4;\n\ + dst4 *= uint8Scale;\n\ + half4 dst;\n\ + _viv_asm(CONV, dst, dst4);\n\ + vxc_short8 dst_short;\n\ + _viv_asm(COPY, dst_short, dst, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst_short,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_in.y++;\n\ + coord_out.y ++;\n\ + } while (coord_out.y < out_height);\n\ +}\n\ +\n\ +__kernel void resize_1d_bilinear_U8toU8_UP\n\ + (\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value;\n\ +\n\ + float4 left_x_f = floor(in_x);\n\ + float4 x_lerp = in_x - left_x_f;\n\ + int4 left_x_idx = convert_int4(left_x_f);\n\ + float4 right_x_f = ceil(in_x);\n\ + int4 right_x_idx = convert_int4(right_x_f);\n\ +\n\ +\n\ + vxc_uchar16 src0, src1;\n\ +\n\ + vxc_uchar16 top;\n\ +\n\ + int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0);\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_ushort8 bitextract_p0;\n\ + vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};\n\ + VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8);\n\ + vxc_ushort8 constData = 8;\n\ + VXC_DP2x8(maskShift, bitextract_p0, constData, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, 
output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + do\n\ + {\n\ + VXC_BitExtract(top, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y++;\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww,\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 left4;\n\ + float4 right4;\n\ +\n\ + unsigned char inputZP;\n\ + _viv_asm(COPY, inputZP, input_ZP, 4);\n\ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);\n\ +\n\ + float4 dst4 = right4 * x_lerp + left4;\n\ + dst4 = dst4 * uint8Scale + output_ZP;\n\ + int4 dst = convert_int4_rte(dst4);\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + coord_out.y ++;\n\ + } while (coord_out.y < out_height);\n\ +}\n\ +\n\ +__kernel void resize_1d_bilinear_U8toU8_DOWN\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value;\n\ + float4 left_x_f = floor(in_x);\n\ + float4 x_lerp = in_x - left_x_f;\n\ + int4 left_x_idx = convert_int4(left_x_f);\n\ + vxc_uchar16 src;\n\ + int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0);\n\ + float4 left4;\n\ + float4 right4;\n\ + vxc_uchar16 result;\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + unsigned char inputZP;\n\ + _viv_asm(COPY, inputZP, input_ZP, 4);\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ +\n\ + do\n\ + {\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.y;\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.z;\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.w;\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP4x4(left4, src, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, src, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_right_4x4);\n\ + right4 -= left4;\n\ + float4 dst4 = right4 * x_lerp + left4;\n\ + dst4 = dst4 * uint8Scale + output_ZP;\n\ + int4 dst = convert_int4_rte(dst4);\n\ +\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,\n\ + 
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_in.y++;\n\ + coord_out.y ++;\n\ + } while (coord_out.y < out_height);\n\ +\n\ +}\n\ +"; /* end of resize_1d_bilinear_U8_vx*/ + +static const char resize_1d_bilinear_U8_opt_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float scale_x;\n\ +_viv_uniform int out_height;\n\ +_viv_uniform VXC_512Bits uniConvertI32toI16_2x8;\n\ +_viv_uniform VXC_512Bits uniGetMaskShift_2x8;\n\ +_viv_uniform VXC_512Bits uniBilinear_4x4;\n\ +_viv_uniform float half_pixel_value;\n\ +\n\ +__kernel void resize_1d_bilinear_U8toU8_UP_opt\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers,\n\ + __read_only image2d_array_t scale\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(2), 0);\n\ +\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value;\n\ +\n\ + float4 left_x_f = floor(in_x);\n\ + int4 left_x_idx = convert_int4(left_x_f);\n\ + int4 right_x_idx = left_x_idx + 1;\n\ +\n\ + vxc_uchar16 src0;\n\ +\n\ + vxc_uchar16 src_mask;\n\ +\n\ + int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0);\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_ushort8 bitextract_p0;\n\ + vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};\n\ + VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8);\n\ + vxc_ushort8 constData = 8;\n\ + VXC_DP2x8(maskShift, bitextract_p0, constData,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);\n\ +\n\ + vxc_ushort8 lerp_0;\n\ + vxc_half8 lerp;\n\ +\n\ + int2 coord = (int2)(coord_out.x * 2, 0);\n\ + VXC_ReadImage(lerp_0, scale, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, lerp, lerp_0, 16);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + do\n\ + {\n\ + VXC_BitExtract(src_mask, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.y++;\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww,\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_uchar16 dst;\n\ + VXC_DP4x4(dst, src_mask, lerp,\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4x4);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + coord_out.y ++;\n\ + } while (coord_out.y < out_height);\n\ +}\n\ +"; /* end of resize_1d_bilinear_U8_opt_vx*/ + +static const char resize_1d_bilinear_UP_NX_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniResizeNxUp_2x8;\n\ +_viv_uniform int out_height;\n\ +\n\ +#define RESIZE_1D_NX_SAME_PROCESS(read_type, data_type) \\\n\ + read_type read_data, save_data; \\\n\ + data_type in0, result; \\\n\ + \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr = (int)coord_in.z * 
input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + \\\n\ + while (coord_out.y < out_height) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, read_data, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, in0, read_data, 16); \\\n\ + VXC_DP2x8(result, in0, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResizeNxUp_2x8); \\\n\ + _viv_asm(COPY, save_data, result, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, save_data, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_in.y++; \\\n\ + coord_out.y++; \\\n\ + } \\\n\ +\n\ +#define RESIZE_1D_2X_HALF_SAME(name0, name1, read_type, data_type) \\\n\ +__kernel void resize_1d_bilinear_##name0##to##name1##_UP_2X_HALF_SAME \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int scale_type \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); \\\n\ + int4 coord_in = (int4)(get_global_id(0), 0, get_global_id(1), 0); \\\n\ + coord_in.x = (coord_out.x * 2 - 1) >> 2; \\\n\ + coord_in.x = coord_out.x == 0 ? -1 : coord_in.x; \\\n\ + RESIZE_1D_NX_SAME_PROCESS(read_type, data_type) \\\n\ +}\n\ +\n\ +RESIZE_1D_2X_HALF_SAME(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +RESIZE_1D_2X_HALF_SAME(I8, I8, vxc_char16, vxc_char16)\n\ +RESIZE_1D_2X_HALF_SAME(I16, I16, vxc_short8, vxc_short8)\n\ +RESIZE_1D_2X_HALF_SAME(F16, F16, vxc_short8, vxc_half8)\n\ +\n\ +\n\ +#define RESIZE_1D_2X_SAME(name0, name1, read_type, data_type) \\\n\ +__kernel void resize_1d_bilinear_##name0##to##name1##_UP_2X_SAME \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int scale_type \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); \\\n\ + int4 coord_in = (int4)(get_global_id(0), 0, get_global_id(1), 0); \\\n\ + coord_in.x = coord_out.x >> 1; \\\n\ + RESIZE_1D_NX_SAME_PROCESS(read_type, data_type) \\\n\ +}\n\ +\n\ +RESIZE_1D_2X_SAME(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +RESIZE_1D_2X_SAME(I8, I8, vxc_char16, vxc_char16)\n\ +RESIZE_1D_2X_SAME(I16, I16, vxc_short8, vxc_short8)\n\ +RESIZE_1D_2X_SAME(F16, F16, vxc_short8, vxc_half8)\n\ +\n\ +\n\ +#define RESIZE_1D_4X_HALF_SAME(name0, name1, read_type, data_type) \\\n\ +__kernel void resize_1d_bilinear_##name0##to##name1##_UP_4X_HALF_SAME \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int scale_type \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); \\\n\ + int4 coord_in = (int4)(get_global_id(0), 0, get_global_id(1), 0); \\\n\ + coord_in.x = (coord_out.x * 2 - 3) >> 3; \\\n\ + coord_in.x = coord_out.x == 0 ? 
-1 : coord_in.x; \\\n\ + RESIZE_1D_NX_SAME_PROCESS(read_type, data_type) \\\n\ +}\n\ +\n\ +RESIZE_1D_4X_HALF_SAME(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +RESIZE_1D_4X_HALF_SAME(I8, I8, vxc_char16, vxc_char16)\n\ +RESIZE_1D_4X_HALF_SAME(I16, I16, vxc_short8, vxc_short8)\n\ +RESIZE_1D_4X_HALF_SAME(F16, F16, vxc_short8, vxc_half8)\n\ +\n\ +\n\ +#define RESIZE_1D_4X_SAME(name0, name1, read_type, data_type) \\\n\ +__kernel void resize_1d_bilinear_##name0##to##name1##_UP_4X_SAME \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int scale_type \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); \\\n\ + int4 coord_in = (int4)(get_global_id(0), 0, get_global_id(1), 0); \\\n\ + coord_in.x = coord_out.x >> 2; \\\n\ + RESIZE_1D_NX_SAME_PROCESS(read_type, data_type) \\\n\ +}\n\ +\n\ +RESIZE_1D_4X_SAME(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +RESIZE_1D_4X_SAME(I8, I8, vxc_char16, vxc_char16)\n\ +RESIZE_1D_4X_SAME(I16, I16, vxc_short8, vxc_short8)\n\ +RESIZE_1D_4X_SAME(F16, F16, vxc_short8, vxc_half8)\n\ +\n\ +\n\ +#define RESIZE_1D_8X_HALF_SAME(name0, name1, read_type, data_type) \\\n\ +__kernel void resize_1d_bilinear_##name0##to##name1##_UP_8X_HALF_SAME \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int scale_type \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); \\\n\ + int4 coord_in = (int4)(get_global_id(0), 0, get_global_id(1), 0); \\\n\ + coord_in.x = (coord_out.x * 2 - 7) >> 4; \\\n\ + coord_in.x = coord_out.x == 0 ? -1 : coord_in.x; \\\n\ + RESIZE_1D_NX_SAME_PROCESS(read_type, data_type) \\\n\ +}\n\ +\n\ +RESIZE_1D_8X_HALF_SAME(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +RESIZE_1D_8X_HALF_SAME(I8, I8, vxc_char16, vxc_char16)\n\ +RESIZE_1D_8X_HALF_SAME(I16, I16, vxc_short8, vxc_short8)\n\ +RESIZE_1D_8X_HALF_SAME(F16, F16, vxc_short8, vxc_half8)\n\ +\n\ +\n\ +#define RESIZE_1D_8X_SAME(name0, name1, read_type, data_type) \\\n\ +__kernel void resize_1d_bilinear_##name0##to##name1##_UP_8X_SAME \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int scale_type \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); \\\n\ + int4 coord_in = (int4)(get_global_id(0), 0, get_global_id(1), 0); \\\n\ + coord_in.x = coord_out.x >> 3; \\\n\ + RESIZE_1D_NX_SAME_PROCESS(read_type, data_type) \\\n\ +}\n\ +\n\ +RESIZE_1D_8X_SAME(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +RESIZE_1D_8X_SAME(I8, I8, vxc_char16, vxc_char16)\n\ +RESIZE_1D_8X_SAME(I16, I16, vxc_short8, vxc_short8)\n\ +RESIZE_1D_8X_SAME(F16, F16, vxc_short8, vxc_half8)\n\ +\n\ +\n\ +"; /* end of resize_1d_bilinear_UP_NX_vx*/ + +static const char resize_1d_nearest_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniMultiplyAndPostShift_2x8;\n\ +_viv_uniform float scale_x;\n\ +_viv_uniform float half_pixel_value;\n\ +_viv_uniform float round_value;\n\ +_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp\n\ +\n\ +#define NEAREST_INDEX_PROCESS() \\\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); \\\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x + round_value; \\\n\ + int4 in_x_idx = convert_int4(in_x); \\\n\ +\n\ +\n\ +__kernel void resize_1d_nearest_F16toF16\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only 
image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + NEAREST_INDEX_PROCESS()\n\ +\n\ + vxc_short8 src;\n\ + int4 coord_in = (int4)(in_x_idx.x, coord_out.y, coord_out.z, 0);\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = in_x_idx.y;\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = in_x_idx.z;\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = in_x_idx.w;\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniGetExtractData_2x8;\n\ +__kernel void resize_1d_nearest_F16toF16_op\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + NEAREST_INDEX_PROCESS()\n\ +\n\ + vxc_ushort8 src0, src1, dst;\n\ + int4 coord_in = (int4)(in_x_idx.x, coord_out.y, coord_out.z, 0);\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16);\n\ + vxc_ushort8 input_idx;\n\ + _viv_asm(COPY, input_idx, in_x_idx, 16);\n\ + VXC_DP2x8(mask, input_idx, input_idx, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetExtractData_2x8);\n\ + VXC_BitExtract(dst, src0, src1, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertI8toI8_2x8;\n\ +__kernel void resize_1d_nearest_I8toI8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + NEAREST_INDEX_PROCESS()\n\ +\n\ + vxc_char16 src;\n\ + int4 coord_in = (int4)(in_x_idx.x, coord_out.y, coord_out.z, 0);\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, 
output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = in_x_idx.y;\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = in_x_idx.z;\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = in_x_idx.w;\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(src, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void resize_1d_nearest_I8toI8_op\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + NEAREST_INDEX_PROCESS()\n\ +\n\ + vxc_uchar16 src0, dst0;\n\ + vxc_char16 dst;\n\ + int4 coord_in = (int4)(in_x_idx.x, coord_out.y, coord_out.z, 0);\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8);\n\ + vxc_ushort8 input_idx;\n\ + _viv_asm(COPY, input_idx, in_x_idx, 16);\n\ + VXC_DP2x8(mask, input_idx, input_idx, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetExtractData_2x8);\n\ + VXC_BitExtract(dst0, src0, src0, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, dst, dst0, 8);\n\ + VXC_DP2x8(dst, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void resize_1d_nearest_U8toU8\n\ + (\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + NEAREST_INDEX_PROCESS()\n\ +\n\ + vxc_uchar16 src;\n\ + int4 coord_in = (int4)(in_x_idx.x, coord_out.y, coord_out.z, 0);\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = in_x_idx.y;\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = in_x_idx.z;\n\ + VXC_OP4(img_load_3d, src, 
input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = in_x_idx.w;\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_ushort8 multiplier;\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16);\n\ + VXC_DP2x8(src, src, multiplier, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void resize_1d_nearest_U8toU8_op\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + NEAREST_INDEX_PROCESS()\n\ +\n\ + vxc_uchar16 src0, dst;\n\ + int4 coord_in = (int4)(in_x_idx.x, coord_out.y, coord_out.z, 0);\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8);\n\ + vxc_ushort8 input_idx;\n\ + _viv_asm(COPY, input_idx, in_x_idx, 16);\n\ + VXC_DP2x8(mask, input_idx, input_idx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetExtractData_2x8);\n\ + VXC_BitExtract(dst, src0, src0, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + vxc_ushort8 multiplier;\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16);\n\ + VXC_DP2x8(dst, dst, multiplier, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void resize_1d_nearest_I16toI16\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + NEAREST_INDEX_PROCESS()\n\ +\n\ + vxc_short8 src;\n\ + int4 coord_in = (int4)(in_x_idx.x, coord_out.y, coord_out.z, 0);\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = in_x_idx.y;\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = in_x_idx.z;\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = in_x_idx.w;\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP2x8(src, src, src, VXC_MODIFIER(0, 3, 0, 
VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void resize_1d_nearest_I16toI16_op\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + NEAREST_INDEX_PROCESS()\n\ +\n\ + vxc_ushort8 src0, src1, dst0;\n\ + vxc_short8 dst;\n\ + int4 coord_in = (int4)(in_x_idx.x, coord_out.y, coord_out.z, 0);\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16);\n\ + vxc_ushort8 input_idx;\n\ + _viv_asm(COPY, input_idx, in_x_idx, 16);\n\ + VXC_DP2x8(mask, input_idx, input_idx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetExtractData_2x8);\n\ + VXC_BitExtract(dst0, src0, src1, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, dst, dst0, 8);\n\ + VXC_DP2x8(dst, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of resize_1d_nearest_vx*/ + static const char resize_bilinear_BF16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform float2 scale_xy;\n\ @@ -36151,14 +37754,14 @@ static const char eltwise_ops_helper_cl[] = "#pragma OPENCL EXTENSION CL_VIV_asm "; /* end of eltwise_ops_helper_cl*/ static const char eltwise_unary_cl[] = "\n\ -float4 eltwise_unary_sin(float4 x)\n\ +float4 eltwise_unary_sin(float4 x, float alpha)\n\ {\n\ return native_sin(x);\n\ }\n\ \n\ #define logE (1.44269502f)\n\ #define twoLogE (logE * 2.0f)\n\ -float4 eltwise_unary_exp(float4 x)\n\ +float4 eltwise_unary_exp(float4 x, float alpha)\n\ {\n\ x *= logE;\n\ x = exp2(x);\n\ @@ -36166,33 +37769,33 @@ float4 eltwise_unary_exp(float4 x)\n\ }\n\ \n\ #define rlogE (0.693147182f)\n\ -float4 eltwise_unary_log(float4 x)\n\ +float4 eltwise_unary_log(float4 x, float alpha)\n\ {\n\ x = log2(x);\n\ return x * rlogE;\n\ }\n\ \n\ -float4 eltwise_unary_elu(float4 val)\n\ +float4 eltwise_unary_elu(float4 val, float alpha)\n\ {\n\ float4 x = val * logE;\n\ - x = exp2(x) - 1;\n\ + x = exp2(x) * alpha - alpha;\n\ \n\ return val < 0 ? 
x : val;\n\ }\n\ \n\ -float4 eltwise_unary_neg(float4 x)\n\ +float4 eltwise_unary_neg(float4 x, float alpha)\n\ {\n\ return x * -1;\n\ }\n\ \n\ -float4 eltwise_unary_hard_sigmoid(float4 x)\n\ +float4 eltwise_unary_hard_sigmoid(float4 x, float alpha)\n\ {\n\ x = 0.2 * x + 0.5;\n\ x = clamp(x, 0, 1);\n\ return x;\n\ }\n\ \n\ -float4 _softrelu(float4 x)\n\ +float4 _softrelu(float4 x, float alpha)\n\ {\n\ x *= logE;\n\ x = exp2(x);\n\ @@ -36201,7 +37804,7 @@ float4 _softrelu(float4 x)\n\ return x * rlogE;\n\ }\n\ \n\ -float4 _tanh(float4 x)\n\ +float4 _tanh(float4 x, float alpha)\n\ {\n\ x *= -twoLogE;\n\ x = 1 + exp2(x);\n\ @@ -36209,10 +37812,10 @@ float4 _tanh(float4 x)\n\ return (2 * x - 1);\n\ }\n\ \n\ -float4 eltwise_unary_mish(float4 x)\n\ +float4 eltwise_unary_mish(float4 x, float alpha)\n\ {\n\ - float4 y = _softrelu(x);\n\ - x = x * _tanh(y);\n\ + float4 y = _softrelu(x, alpha);\n\ + x = x * _tanh(y, alpha);\n\ return x;\n\ }\n\ \n\ @@ -36224,14 +37827,15 @@ __kernel void func_name##_F32toF32 \\\n\ float inputScale, \\\n\ float inputTail, \\\n\ float outputScale, \\\n\ - float outputZP \\\n\ + float outputZP, \\\n\ + float alpha \\\n\ ) \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ \\\n\ float4 src = read_imagef(input, coord); \\\n\ \\\n\ - float4 dst = eltwise_unary_##func_name(src); \\\n\ + float4 dst = eltwise_unary_##func_name(src, alpha); \\\n\ \\\n\ write_imagef(output, coord, dst); \\\n\ }\n\ @@ -36251,14 +37855,15 @@ __kernel void func_name##_F32toF32_2D \\\n\ float inputScale, \\\n\ float inputTail, \\\n\ float outputScale, \\\n\ - float outputZP \\\n\ + float outputZP, \\\n\ + float alpha \\\n\ ) \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ \\\n\ float4 src = read_imagef(input, coord); \\\n\ \\\n\ - float4 dst = eltwise_unary_##func_name(src); \\\n\ + float4 dst = eltwise_unary_##func_name(src, alpha); \\\n\ \\\n\ write_imagef(output, coord, dst); \\\n\ }\n\ @@ -36278,7 +37883,8 @@ __kernel void func_name##_U8toU8 \\\n\ float inputScale, \\\n\ float inputTail, \\\n\ float outputScale, \\\n\ - float outputZP \\\n\ + float outputZP, \\\n\ + float alpha \\\n\ ) \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ @@ -36286,7 +37892,7 @@ __kernel void func_name##_U8toU8 \\\n\ uint4 src = read_imageui(input, coord); \\\n\ float4 data = convert_float4(src) * inputScale - inputTail; \\\n\ \\\n\ - data = eltwise_unary_##func_name(data); \\\n\ + data = eltwise_unary_##func_name(data, alpha); \\\n\ uint4 dst = convert_uint4(data * outputScale + outputZP); \\\n\ \\\n\ write_imageui(output, coord, dst); \\\n\ @@ -36307,7 +37913,8 @@ __kernel void func_name##_U8toU8_2D \\\n\ float inputScale, \\\n\ float inputTail, \\\n\ float outputScale, \\\n\ - float outputZP \\\n\ + float outputZP, \\\n\ + float alpha \\\n\ ) \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ @@ -36315,7 +37922,7 @@ __kernel void func_name##_U8toU8_2D \\\n\ uint4 src = read_imageui(input, coord); \\\n\ float4 data = convert_float4(src) * inputScale - inputTail; \\\n\ \\\n\ - data = eltwise_unary_##func_name(data); \\\n\ + data = eltwise_unary_##func_name(data, alpha); \\\n\ uint4 dst = convert_uint4(data * outputScale + outputZP); \\\n\ \\\n\ write_imageui(output, coord, dst); \\\n\ @@ -36336,7 +37943,8 @@ __kernel void neg_I32toI32\n\ float inputScale,\n\ float inputTail,\n\ float outputScale,\n\ - float outputZP\n\ + float outputZP,\n\ + float alpha\n\ )\n\ {\n\ 
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ @@ -36354,7 +37962,8 @@ __kernel void neg_I32toI32_2D\n\ float inputScale,\n\ float inputTail,\n\ float outputScale,\n\ - float outputZP\n\ + float outputZP,\n\ + float alpha\n\ )\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ @@ -45352,6 +46961,103 @@ __kernel void relu_keras_U8toF32_2D(\n\ write_imagef(output, coord, dst);\n\ }"; /* end of relu_keras_cl*/ +static const char resize_1d_bilinear_cl[] = "__kernel void resize_1d_bilinear_F32toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float scale_x,\n\ + float half_pixel_value\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x - half_pixel_value;\n\ + float left_x_f = floor(in_x);\n\ + float x_lerp = in_x - left_x_f;\n\ + int left_x_idx = convert_int(left_x_f);\n\ + int4 coord_in = (int4)(left_x_idx, coord_out.y, coord_out.z, 0);\n\ + float4 top_l, top_r, top, bottom, dst;\n\ +\n\ + top_l = read_imagef(input, coord_in);\n\ + coord_in.x++;\n\ + top_r = read_imagef(input, coord_in);\n\ +\n\ + top_r = top_r - top_l;\n\ + dst = top_l + x_lerp * top_r;\n\ +\n\ + write_imagef(output, coord_out, dst);\n\ +\n\ +}\n\ +\n\ +\n\ +__kernel void resize_1d_bilinear_U8toU8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float scale_x,\n\ + float half_pixel_value,\n\ + float in_scale,\n\ + float in_tail,\n\ + float out_scale,\n\ + float out_tail\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x - half_pixel_value;\n\ + float left_x_f = floor(in_x);\n\ + float x_lerp = in_x - left_x_f;\n\ + int left_x_idx = convert_int(left_x_f);\n\ + int4 coord_in = (int4)(left_x_idx, coord_out.y, coord_out.z, 0);\n\ + float4 top_l, top_r, top;\n\ + uint4 dst;\n\ +\n\ + top_l = convert_float4(read_imageui(input, coord_in)) * in_scale + in_tail;\n\ + coord_in.x++;\n\ + top_r = convert_float4(read_imageui(input, coord_in)) * in_scale + in_tail;\n\ +\n\ + top_r = top_r - top_l;\n\ + top = top_l + x_lerp * top_r;\n\ + dst = convert_uint4(top * out_scale + out_tail);\n\ +\n\ + write_imageui(output, coord_out, dst);\n\ +}\n\ +"; /* end of resize_1d_bilinear_cl*/ + +static const char resize_1d_nearest_cl[] = "\n\ +#define NEAREST_INDEX_PROCESS() \\\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x + round_value; \\\n\ + int in_x_idx = convert_int(in_x); \\\n\ +\n\ +__kernel void resize_1d_nearest_F32toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float scale_x,\n\ + float half_pixel_value,\n\ + float round_value)\n\ +{\n\ + NEAREST_INDEX_PROCESS()\n\ + int4 coord_in = (int4)(in_x_idx, coord_out.y, coord_out.z, 0);\n\ + float4 dst;\n\ + dst = read_imagef(input, coord_in);\n\ + write_imagef(output, coord_out, dst);\n\ +}\n\ +\n\ +\n\ +__kernel void resize_1d_nearest_U8toU8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float scale_x,\n\ + float half_pixel_value,\n\ + float round_value,\n\ + float output_scale,\n\ + float output_tail)\n\ +{\n\ + NEAREST_INDEX_PROCESS()\n\ + int4 coord_in = (int4)(in_x_idx, coord_out.y, coord_out.z, 0);\n\ + uint4 
dst;\n\ + dst = convert_uint4(convert_float4(read_imageui(input, coord_in)) * output_scale + output_tail);\n\ + write_imageui(output, coord_out, dst);\n\ +}\n\ +"; /* end of resize_1d_nearest_cl*/ + static const char resize_bilinear_cl[] = "__kernel void resize_bilinear_F32toF32(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output,\n\ @@ -46418,6 +48124,15 @@ static const source_map_t evis_resource[] = {"relational_ops_2d_vx", relational_ops_2d_vx}, {"relational_ops_3d_vx", relational_ops_3d_vx}, {"relu_keras_vx", relu_keras_vx}, + {"resize_1d_bilinear_BF16_vx", resize_1d_bilinear_BF16_vx}, + {"resize_1d_bilinear_DOWN_NX_vx", resize_1d_bilinear_DOWN_NX_vx}, + {"resize_1d_bilinear_F16_vx", resize_1d_bilinear_F16_vx}, + {"resize_1d_bilinear_I16_vx", resize_1d_bilinear_I16_vx}, + {"resize_1d_bilinear_I8_vx", resize_1d_bilinear_I8_vx}, + {"resize_1d_bilinear_U8_vx", resize_1d_bilinear_U8_vx}, + {"resize_1d_bilinear_U8_opt_vx", resize_1d_bilinear_U8_opt_vx}, + {"resize_1d_bilinear_UP_NX_vx", resize_1d_bilinear_UP_NX_vx}, + {"resize_1d_nearest_vx", resize_1d_nearest_vx}, {"resize_bilinear_BF16_vx", resize_bilinear_BF16_vx}, {"resize_bilinear_F16_vx", resize_bilinear_F16_vx}, {"resize_bilinear_I16_vx", resize_bilinear_I16_vx}, @@ -46552,6 +48267,8 @@ static const source_map_t cl_resource[] = {"reduceprod_internal_axis2_cl", reduceprod_internal_axis2_cl}, {"relational_ops_cl", relational_ops_cl}, {"relu_keras_cl", relu_keras_cl}, + {"resize_1d_bilinear_cl", resize_1d_bilinear_cl}, + {"resize_1d_nearest_cl", resize_1d_nearest_cl}, {"resize_bilinear_cl", resize_bilinear_cl}, {"resize_nearest_cl", resize_nearest_cl}, {"scatter_nd_cl", scatter_nd_cl}, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c index c760898..64b94ee 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c @@ -176,8 +176,8 @@ static vsi_status op_optimize reshape 3d input (xcn) --> 4d input (whcn) reshape 3d output(xcn) --> 4d output(whcn) */ - shape[0] = inputs[0]->attr.size[0]; - shape[1] = 1; + shape[0] = 1; + shape[1] = inputs[0]->attr.size[0]; shape[2] = inputs[0]->attr.size[1]; shape[3] = inputs[0]->attr.size[2]; dim = 4; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c b/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c index 86e6ae2..25a8787 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c @@ -136,6 +136,7 @@ static vsi_bool op_check IO_TYPE(D_I32, D_F32) IO_TYPE(D_I32, D_I32) IO_TYPE(D_I32, D_U32) + IO_TYPE(D_I32, D_F16) IO_TYPE(D_I32, D_BOOL8) IO_TYPE(D_U32, D_F32) IO_TYPE(D_U32, D_I32) @@ -176,6 +177,7 @@ static vsi_bool op_check IO_TYPE(D_U8|Q_ASYM, D_F32) IO_TYPE(D_U8|Q_ASYM, D_I32) IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_U8, D_F16) END_IO_TYPE_DECL(CAST) if(!VALIDATE_OP_IO_TYPES(CAST, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c b/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c index 6ee4172..d36cf41 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c @@ -37,6 +37,29 @@ #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_constraint_check.h" +static vsi_bool _enable_concat_optimize() +{ + char *envctrl; + static int32_t enableOptimize = -1; + + if (enableOptimize == -1) + { + enableOptimize = 1; + envctrl = 
getenv("VSI_NN_ENABLE_CONCAT_OPTIMIZE"); + if (envctrl) + { + enableOptimize = atoi(envctrl); + } + } + + if (enableOptimize == 1) + { + return TRUE; + } + + return FALSE; +} + static int32_t _get_input_num ( vsi_nn_node_t * self, @@ -243,7 +266,8 @@ static vsi_status op_compute status = VSI_SUCCESS; self->n = NULL; - if(_is_highest_dimension(self, outputs) && _is_same_quant(self, inputs, outputs)) + if(_is_highest_dimension(self, outputs) && _is_same_quant(self, inputs, outputs) + && _enable_concat_optimize()) { iter = self->nn_param.concat.lcl_data; while( NULL != iter ) @@ -397,7 +421,8 @@ static vsi_status op_optimize status = VSI_SUCCESS; /* we don't create tensor view if the axis is not the highest dimension */ if (_is_highest_dimension(self, outputs) == FALSE || - _is_same_quant(self, inputs, outputs) == FALSE) + _is_same_quant(self, inputs, outputs) == FALSE || + _enable_concat_optimize() == FALSE) { return status; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c index 8d8de45..ca6b3db 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c @@ -194,6 +194,7 @@ static vsi_bool op_check IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F16) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_I8|Q_ASYM) IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_depthwise_conv1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_depthwise_conv1d.c index 5cbb9d6..4835ab7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_depthwise_conv1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_depthwise_conv1d.c @@ -31,6 +31,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" /* @@ -73,6 +74,7 @@ static vsi_status op_compute { status = VSI_SUCCESS; } + vsi_nn_kernel_param_release( ¶m ); return status; } /* op_compute() */ @@ -119,7 +121,7 @@ static vsi_bool op_setup VSI_NN_ROUND_FLOOR ); - outputs[0]->attr.size[1] = inputs[1]->attr.size[2]; + outputs[0]->attr.size[1] = inputs[0]->attr.size[1] * p->multiplier; outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c index c4192d3..c74da7a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c @@ -45,22 +45,30 @@ static vsi_status _eltwise_unary_op_compute ) { vsi_status status = VSI_FAILURE; + float alpha = 0; + vsi_nn_kernel_param_t * param = NULL; if( NULL == self ) { return status; } + param = vsi_nn_kernel_param_create(); + + alpha = self->nn_param.elu.alpha; + vsi_nn_kernel_param_add_float32( param, "alpha", alpha ); // TODO: This optimzie is a hack for gpu path, // it should be moved to gpu kernel setup. 
self->n = (vx_node)vsi_nn_kernel_selector( self->graph, - kernel_name, inputs, 1, outputs, 1, NULL ); + kernel_name, inputs, 1, outputs, 1, param ); if( self->n ) { status = VSI_SUCCESS; } + vsi_nn_kernel_param_release( ¶m ); + return status; } /* _eltwise_op_compute() */ @@ -152,6 +160,19 @@ static vsi_bool op_check return TRUE; } /* op_check() */ +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + if (vsi_nn_compareVersion(self->graph, 1, 1, 29) == -1) + { + self->nn_param.elu.alpha = 1; + } + + return VSI_SUCCESS; +} /* op_init() */ + #ifdef __cplusplus extern "C" { #endif @@ -166,7 +187,7 @@ extern "C" { { \ return _eltwise_unary_op_compute( ""#kernel_name, self, inputs, outputs ); \ } \ -DEF_OP_REG(name, NULL, op_compute_##kernel_name, vsi_nn_op_common_deinit, op_check, op_setup, NULL, 2, 1) +DEF_OP_REG(name, op_init, op_compute_##kernel_name, vsi_nn_op_common_deinit, op_check, op_setup, NULL, 2, 1) DEF_ELEMENT_WISE_UNARY_OP( SIN, sin ); DEF_ELEMENT_WISE_UNARY_OP( EXP, exp ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_hashtable_lookup.c b/src/tim/vx/internal/src/ops/vsi_nn_op_hashtable_lookup.c index a0d0395..dbec838 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_hashtable_lookup.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_hashtable_lookup.c @@ -73,6 +73,7 @@ static vsi_bool op_check IO_TYPE(D_I32, D_I32, D_I32, D_F16, D_I32) IO_TYPE(D_I32, D_I32, D_F32, D_F16, D_F32) IO_TYPE(D_I32, D_I32, D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I32, D_I32, D_F32, D_F32, D_U8|Q_ASYM) END_IO_TYPE_DECL(HASHTABLE_LOOKUP) if (!VALIDATE_OP_IO_TYPES(HASHTABLE_LOOKUP, self, inputs, self->input.num, outputs, self->output.num)) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_interp.c b/src/tim/vx/internal/src/ops/vsi_nn_op_interp.c new file mode 100644 index 0000000..cc38677 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_interp.c @@ -0,0 +1,298 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +/* + Declare number of input and output. 
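+ INTERP takes two inputs: inputs[0] is the data tensor, while inputs[1] is only
+ consulted when neither shrink/zoom factors nor an explicit width/height are set,
+ in which case its shape supplies the output width and height (see op_setup).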
+ */ +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + + status = vsi_nn_internal_compute_node( self ); + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_interp_param *p = NULL; + + p = &self->nn_param.interp; + + if ((p->pad_beg > 0) || (p->pad_end > 0)) + { + VSILOGE("Only supports non-positive padding (cropping) for now "); + return FALSE; + } + + return TRUE; +} /* op_check() */ + + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_interp_param *p = NULL; + int32_t height_in_eff_, width_in_eff_; + int32_t height_out, width_out; + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_internal_tensor_t *crop_tensor = NULL; + vsi_nn_tensor_t *crop_in_tensor = NULL; + float factor = 1.0f; + int32_t pad_beg = 0; + int32_t pad_end = 0; + + if ( NULL == self ) + { + return FALSE; + } + + p = &self->nn_param.interp; + pad_beg = -p->pad_beg; + pad_end = -p->pad_end; + width_in_eff_ = inputs[0]->attr.size[0] + p->pad_beg + p->pad_end; + height_in_eff_ = inputs[0]->attr.size[1] + p->pad_beg + p->pad_end; + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + memcpy( outputs[0]->attr.size, inputs[0]->attr.size, + VSI_NN_MAX_DIM_NUM * sizeof( uint32_t ) ); + if ((p->shrink_factor > 0) && (p->zoom_factor <= 0)) + { + width_out = (width_in_eff_ - 1) / p->shrink_factor + 1; + height_out = (height_in_eff_ - 1) / p->shrink_factor + 1; + } + else if ((p->zoom_factor > 0) && (p->shrink_factor <= 0)) + { + width_out = (width_in_eff_ - 1) * (p->zoom_factor - 1) + width_in_eff_; + height_out = (height_in_eff_ - 1) * (p->zoom_factor - 1) + height_in_eff_; + } + else if ((p->height > 0) && (p->width > 0)) + { + width_out = p->width; + height_out = p->height; + } + else if ((p->zoom_factor > 0) && (p->shrink_factor > 0)) + { + width_out = (width_in_eff_ - 1) / p->shrink_factor + 1; + height_out = (height_in_eff_ - 1) / p->shrink_factor + 1; + width_out = (width_out - 1) * (p->zoom_factor - 1) + width_out; + height_out = (height_out - 1) * (p->zoom_factor - 1) + height_out; + } + else if (NULL != inputs[1]) + { + width_out = inputs[1]->attr.size[0]; + height_out = inputs[1]->attr.size[1]; + } + else + { + VSILOGE("Unsupported params "); + return FALSE; + } + + if ((width_out < 0) || (height_out < 0) || (width_in_eff_ < 0) || (height_in_eff_ < 0)) + { + VSILOGE("value should be positive: width_out %d height_out %d width_in_eff_ %d height_in_eff_ %d ", + width_out, height_out, width_in_eff_, height_in_eff_); + return FALSE; + } + + outputs[0]->attr.size[0] = width_out; + outputs[0]->attr.size[1] = height_out; + } + + factor = (float)(outputs[0]->attr.size[0]) / (float)(width_in_eff_); + + if ((pad_beg > 0) || (pad_end > 0)) + { + vsi_nn_tensor_attr_t attr; + int32_t use_virtual_tensor = 1; + int32_t *begin_dims; + int32_t *end_dims; + int32_t *stride_dims; + uint32_t i; + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, use_virtual_tensor); + crop_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + crop_in_tensor = crop_tensor->t; + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_STRIDED_SLICE, 1, 1 ); + 
curr->node->nn_param.strided_slice.begin_dims_num = inputs[0]->attr.dim_num; + curr->node->nn_param.strided_slice.end_dims_num = inputs[0]->attr.dim_num; + curr->node->nn_param.strided_slice.stride_dims_num = inputs[0]->attr.dim_num; + curr->node->nn_param.strided_slice.begin_mask = 0; + curr->node->nn_param.strided_slice.end_mask = 0; + curr->node->nn_param.strided_slice.shrink_axis_mask = 0; + begin_dims = (int32_t *)vsi_nn_internal_new_node_param(curr, + VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + end_dims = (int32_t *)vsi_nn_internal_new_node_param(curr, + VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + stride_dims = (int32_t *)vsi_nn_internal_new_node_param(curr, + VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + for (i = 0; i < inputs[0]->attr.dim_num; i++) + { + stride_dims[i] = 1; + } + + begin_dims[0] = pad_beg; + begin_dims[1] = pad_beg; + end_dims[0] = inputs[0]->attr.size[0] - pad_end; + end_dims[1] = inputs[0]->attr.size[1] - pad_end; + + if (inputs[0]->attr.dim_num > 2) + { + for (i = 2 ; i < inputs[0]->attr.dim_num; i++) + { + begin_dims[i] = 0; + end_dims[i] = inputs[0]->attr.size[i]; + } + } + curr->node->nn_param.strided_slice.begin_dims = begin_dims; + curr->node->nn_param.strided_slice.end_dims = end_dims; + curr->node->nn_param.strided_slice.stride_dims = stride_dims; + curr->inputs[0] = inputs[0]; + curr->outputs[0] = crop_in_tensor; + vsi_nn_internal_setup_node(self, curr); + } + else + { + crop_in_tensor = inputs[0]; + } + + if ((width_in_eff_ == (int32_t)outputs[0]->attr.size[0]) && (height_in_eff_ == (int32_t)outputs[0]->attr.size[1])) + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 1, 1 ); + curr->inputs[0] = crop_in_tensor; + curr->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node(self, curr); + } + else + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESIZE_INTERNAL, 1, 1 ); + curr->node->nn_param.resize_internal.align_corners = vx_true_e; + curr->node->nn_param.resize_internal.factor = factor; + curr->node->nn_param.resize_internal.half_pixel_centers = vx_false_e; + curr->inputs[0] = crop_in_tensor; + curr->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node(self, curr); + } + + return TRUE; +} /* op_setup() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + vsi_status status; + + status = VSI_SUCCESS; + vsi_nn_internal_optimize_node( self, direction ); + + return status; +} /* op_optimize() */ + +static vsi_status op_init + ( + vsi_nn_node_t* self + ) +{ + vsi_status status = VSI_SUCCESS; + + status = vsi_nn_internal_init_node_wksp(self); + self->nn_param.interp.height = 0; + self->nn_param.interp.width = 0; + self->nn_param.interp.pad_beg = 0; + self->nn_param.interp.pad_end = 0; + self->nn_param.interp.shrink_factor = 0; + self->nn_param.interp.zoom_factor = 0; + + return status; +} /* op_init() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t* self + ) +{ + vsi_status status = VSI_SUCCESS; + + vsi_nn_internal_deinit_node_wksp(self); + status = vsi_nn_op_common_deinit(self); + + return status; +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ INTERP, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c 
b/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c index cd0a9db..a1a825a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c @@ -155,11 +155,13 @@ static vsi_bool op_check BEGIN_IO_TYPE_DECL(PERMUTE, 1, 1) IO_TYPE(D_F16, D_F16) IO_TYPE(D_F16, D_F32) + IO_TYPE(D_I16, D_I16) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F32) + IO_TYPE(D_I8|Q_SYM_PC, D_I8|Q_SYM_PC) IO_TYPE(D_BOOL8, D_BOOL8) IO_TYPE(D_BOOL8, D_I8|Q_DFP) IO_TYPE(D_F32, D_F32) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d.c new file mode 100644 index 0000000..235d0c3 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d.c @@ -0,0 +1,207 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +/* + Declare number of input and output. 
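+ RESIZE_1D is expanded into internal nodes in op_setup: a DATACONVERT when the
+ output shape equals the input shape, otherwise RESIZE_1D_BILINEAR_INTERNAL or
+ RESIZE_1D_NEAREST_INTERNAL depending on nn_param.resize_1d.type.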
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_bool _is_same_shape + ( + vsi_nn_tensor_t * inputs, + uint32_t *sizes, + uint32_t dims + ) +{ + uint32_t i = 0; + + if (inputs->attr.dim_num != dims) + return FALSE; + + for (i = 0; i < dims; i++) + { + if (sizes[i] != inputs->attr.size[i]) + return FALSE; + } + + return TRUE; +} + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + + status = vsi_nn_internal_compute_node( self ); + + return status; +} /* op_compute() */ + + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + if ( _is_same_shape(inputs[0], outputs[0]->attr.size, outputs[0]->attr.dim_num) ) + { + return vsi_nn_internal_optimize_node(self, direction ); + } + else + { + return VSI_SUCCESS; + } +} /* op_optimize() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + float factor = self->nn_param.resize_1d.factor; + vsi_nn_internal_node_t* curr = NULL; + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + if (factor != 0) + { + outputs[0]->attr.size[0] = (uint32_t)(inputs[0]->attr.size[0] * factor); + } + else + { + outputs[0]->attr.size[0] = self->nn_param.resize_1d.size[0]; + } + outputs[0]->attr.size[1] = inputs[0]->attr.size[1]; + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; + } + + if (_is_same_shape(inputs[0], outputs[0]->attr.size, outputs[0]->attr.dim_num)) + { + vsi_nn_internal_init_node_wksp( self ); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node(self, curr); + } + else if (VSI_NN_INTERPOLATION_BILINEAR == self->nn_param.resize_1d.type) + { + vsi_nn_internal_init_node_wksp( self ); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESIZE_1D_BILINEAR_INTERNAL, 0, 0 ); + curr->node->nn_param.resize_1d_bilinear_internal.align_corners = self->nn_param.resize_1d.align_corners; + curr->node->nn_param.resize_1d_bilinear_internal.factor = self->nn_param.resize_1d.factor; + curr->node->nn_param.resize_1d_bilinear_internal.half_pixel_centers = \ + self->nn_param.resize_1d.half_pixel_centers; + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node(self, curr); + } + else if (VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR == self->nn_param.resize_1d.type) + { + vsi_nn_internal_init_node_wksp( self ); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESIZE_1D_NEAREST_INTERNAL, 0, 0 ); + curr->node->nn_param.resize_1d_nearest_internal.align_corners = self->nn_param.resize_1d.align_corners; + curr->node->nn_param.resize_1d_nearest_internal.factor = self->nn_param.resize_1d.factor; + curr->node->nn_param.resize_1d_nearest_internal.half_pixel_centers = \ + self->nn_param.resize_1d.half_pixel_centers; + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node(self, curr); + } + + return TRUE; +} /* op_setup() */ + +static vsi_status op_init + ( + vsi_nn_node_t* self + ) +{ + return VSI_SUCCESS; +} /* op_init() */ + +static 
vsi_status op_deinit + ( + vsi_nn_node_t* self + ) +{ + vsi_status status = VSI_SUCCESS; + + status = vsi_nn_internal_deinit_node_wksp(self); + + return status; +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ RESIZE_1D, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d_bilinear_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d_bilinear_internal.c new file mode 100644 index 0000000..66ea066 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d_bilinear_internal.c @@ -0,0 +1,171 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +/* + Declare number of input and output. 
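+ This internal op binds the "resize_1d_bilinear" kernel, forwarding align_corners
+ and half_pixel_centers as kernel params; the output width is the input width
+ scaled by 'factor', or taken from resize_1d.size[0] when factor is 0.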
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + int32_t align_corners = self->nn_param.resize_1d_bilinear_internal.align_corners; + int32_t half_pixel_centers = self->nn_param.resize_1d_bilinear_internal.half_pixel_centers; + vsi_nn_kernel_param_t * param = NULL; + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "align_corners", align_corners ); + vsi_nn_kernel_param_add_int32( param, "half_pixel_centers", half_pixel_centers ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "resize_1d_bilinear", + &inputs[0], 1, + &outputs[0], 1, param ); + + if ( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( ¶m ); + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(RESIZE_1D_BILINEAR_INTERNAL, 1, 1) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + END_IO_TYPE_DECL(RESIZE_1D_BILINEAR_INTERNAL) + if (!VALIDATE_OP_IO_TYPES(RESIZE_1D_BILINEAR_INTERNAL, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + float factor = self->nn_param.resize_1d_bilinear_internal.factor; + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + if (factor != 0) + { + outputs[0]->attr.size[0] = (uint32_t)(inputs[0]->attr.size[0] * factor); + } + else + { + outputs[0]->attr.size[0] = self->nn_param.resize_1d.size[0]; + } + outputs[0]->attr.size[1] = inputs[0]->attr.size[1]; + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; + } + + return TRUE; +} /* op_setup() */ + +static vsi_status op_init + ( + vsi_nn_node_t* self + ) +{ + + return VSI_SUCCESS; +} /* op_init() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t* self + ) +{ + vsi_status status = VSI_SUCCESS; + + status = vsi_nn_op_common_deinit(self); + + return status; +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ RESIZE_1D_BILINEAR_INTERNAL, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d_nearest_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d_nearest_internal.c new file mode 100644 index 0000000..edddc1a --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d_nearest_internal.c @@ -0,0 +1,170 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and 
associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +/* + Declare number of input and output. + */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + int32_t align_corners = self->nn_param.resize_1d_nearest_internal.align_corners; + int32_t half_pixel_centers = self->nn_param.resize_1d_nearest_internal.half_pixel_centers; + vsi_nn_kernel_param_t * param = NULL; + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "align_corners", align_corners ); + vsi_nn_kernel_param_add_int32( param, "half_pixel_centers", half_pixel_centers ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "resize_1d_nearest", + &inputs[0], 1, + &outputs[0], 1, param ); + + if ( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( ¶m ); + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(RESIZE_1D_NEAREST_INTERNAL, 1, 1) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + END_IO_TYPE_DECL(RESIZE_1D_NEAREST_INTERNAL) + if (!VALIDATE_OP_IO_TYPES(RESIZE_1D_NEAREST_INTERNAL, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + float factor = self->nn_param.resize_1d_nearest_internal.factor; + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + if (factor != 0) + { + outputs[0]->attr.size[0] = (uint32_t)(inputs[0]->attr.size[0] * factor); + } + else + { + outputs[0]->attr.size[0] = self->nn_param.resize_1d.size[0]; + } + 
outputs[0]->attr.size[1] = inputs[0]->attr.size[1]; + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; + } + + return TRUE; +} /* op_setup() */ + +static vsi_status op_init + ( + vsi_nn_node_t* self + ) +{ + + return VSI_SUCCESS; +} /* op_init() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t* self + ) +{ + vsi_status status = VSI_SUCCESS; + + status = vsi_nn_op_common_deinit(self); + + return status; +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ RESIZE_1D_NEAREST_INTERNAL, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_svdf.c b/src/tim/vx/internal/src/ops/vsi_nn_op_svdf.c index e55d1f8..94fa617 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_svdf.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_svdf.c @@ -163,12 +163,17 @@ static vsi_bool op_check ret = FALSE; } + if(ret) { BEGIN_IO_TYPE_DECL(SVDF, 5, 2) IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F16, D_F16, D_F16) IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_F16, D_F16) IO_TYPE(D_F32, D_F16, D_F16, D_F16, D_F32, D_F32, D_F16) IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F16, D_F16, D_NONE) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_F16, D_NONE) + IO_TYPE(D_F32, D_F16, D_F16, D_F16, D_F32, D_F32, D_NONE) + IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F32, D_NONE) END_IO_TYPE_DECL(SVDF) if(!VALIDATE_OP_IO_TYPES(SVDF, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c index 4a898bc..518b099 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c @@ -117,8 +117,7 @@ static void _try_pack_tensor_data { *p_sz = (uint64_t)bytes; } - free( data ); - data = NULL; + vsi_nn_safe_free( data ); } } } /* _pack_tensor_data() */ @@ -417,6 +416,8 @@ static _op_param_gen_t s_op_gen[] = /* PRE_PROCESS_NV12 */ NULL, /* SCATTER_ND */ NULL, /* DECONVOLUTION1D */ NULL, + /* INTERP */ NULL, + /* RESIZE_1D */ NULL, }; _compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c ); diff --git a/src/tim/vx/internal/src/utils/vsi_nn_tensor_op.c b/src/tim/vx/internal/src/utils/vsi_nn_tensor_op.c index b40e755..febd192 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_tensor_op.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_tensor_op.c @@ -149,7 +149,7 @@ vsi_nn_tensor_t* vsi_nn_Concat src = j; memcpy( &buffer[dst * type_bytes], &tmp[src * type_bytes], type_bytes ); } - free(tmp); + vsi_nn_safe_free( tmp ); offset += strides[axis] * tensors[i]->attr.size[axis]; } tensor_out = vsi_nn_CreateTensorFromData( graph, buffer, &output_attr ); @@ -221,11 +221,7 @@ vsi_nn_tensor_t* vsi_nn_ConvertTensorDtype } } - if( src_buf ) - { - free( src_buf ); - src_buf = NULL; - } + vsi_nn_safe_free( src_buf ); if( dst_buf ) { free( dst_buf ); @@ -333,10 +329,7 @@ vsi_nn_tensor_t* vsi_nn_TensorAdd error: for ( i = 0; i < tensor_num; i++ ) { - if ( buffer[i] ) - { - free(buffer[i]); - } + vsi_nn_safe_free( buffer[i] ); } if( tmp ) { diff --git a/src/tim/vx/internal/src/utils/vsi_nn_util.c b/src/tim/vx/internal/src/utils/vsi_nn_util.c 
index 1482fc1..9ac442d 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_util.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_util.c @@ -710,33 +710,6 @@ vsi_bool vsi_nn_CheckFilePath return FALSE; } /* vsi_nn_CheckFilePath() */ -void vsi_nn_GetFP32MultiAndPostShift - ( - vx_float32 mult, - vx_uint16 *M0, - vx_int8 *N - ) -{ - vx_uint32 uintMult = *((vx_uint32*)(&mult)); - vx_uint32 tmpMultiply = 0; - vx_int32 exp = 0; - vx_uint32 postShiftBit6to5 = 0; - vx_uint32 postShift = 0; - vx_int8 tmpPostShift = 0; - - tmpMultiply = (uintMult & 0x7FFFFF) >> 8; - *M0 = (vx_uint16)((1U << 15) + tmpMultiply); - - exp = (uintMult & 0x7F800000) >> 23; /* postShift is Scale's exp*/ - tmpPostShift = 15 - ((vx_int8)exp - 127); - postShift = tmpPostShift & 0x1F; - tmpPostShift = tmpPostShift >> 5; - postShiftBit6to5 = tmpPostShift & 3; - - *N = (vx_int8)(((postShiftBit6to5 << 5) | (postShift & 0x1F))); - *N = (((vx_int32)*N << 25) >> 25); -}/* vsi_nn_GetFP32MultiAndPostShift() */ - typedef struct { uint8_t* raw_addr; diff --git a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c index 9732a50..1cde801 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c +++ b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c @@ -520,6 +520,7 @@ static vx_tensor _create_const_raw_tensor vx_tensor tensor = NULL; vx_tensor_create_params_t params; float * scales = NULL; + int32_t * zeroPoints = NULL; memset( ¶ms, 0, sizeof( vx_tensor_create_params_t ) ); params.num_of_dims = attr.dim_num; @@ -539,12 +540,14 @@ static vx_tensor _create_const_raw_tensor #ifdef VSI_PERCHANNEL_QUANTIZATION_SUPPORT // This is a hack that driver doesn't support const scale scales = (float *)malloc(sizeof(float) * attr.dtype.scale_dim); + zeroPoints = (int32_t *)malloc(sizeof(int32_t) * attr.dtype.zero_points_dim); memcpy(scales, attr.dtype.scales, attr.dtype.scale_dim * sizeof(float)); + memcpy(zeroPoints, attr.dtype.zero_points, attr.dtype.zero_points_dim * sizeof(float)); params.quant_data.affinePerChannel.channelDim = attr.dtype.channel_dim; params.quant_data.affinePerChannel.scaleCount = attr.dtype.scale_dim; params.quant_data.affinePerChannel.scales = scales; - params.quant_data.affinePerChannel.zeroPoint = NULL; - params.quant_data.affinePerChannel.zeroPointCount = 0; + params.quant_data.affinePerChannel.zeroPoint = zeroPoints; + params.quant_data.affinePerChannel.zeroPointCount = attr.dtype.zero_points_dim; break; #else VSILOGE( "can't support qnt_type VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC." 
); @@ -580,6 +583,10 @@ static vx_tensor _create_const_raw_tensor { free( scales ); } + if (zeroPoints) + { + free( zeroPoints ); + } return NULL; } } @@ -620,6 +627,10 @@ static vx_tensor _create_const_raw_tensor { free( scales ); } + if (zeroPoints) + { + free( zeroPoints ); + } return tensor; } /* _create_const_raw_tensor() */ @@ -689,6 +700,8 @@ static void _convert_const_I8toU8 if ( tensor->t ) vxReleaseTensor(&tensor->t); tensor->t = vsi_nn_CreateRawTensorFromData(graph, data, attr); + + vsi_nn_safe_free( data ); }/* _convert_const_I8toU8() */ static vsi_status _convert_graph_const_tensor diff --git a/src/tim/vx/internal/src/vsi_nn_log.c b/src/tim/vx/internal/src/vsi_nn_log.c index c818463..9ee1114 100644 --- a/src/tim/vx/internal/src/vsi_nn_log.c +++ b/src/tim/vx/internal/src/vsi_nn_log.c @@ -29,25 +29,49 @@ #include "vsi_nn_log.h" #include "vsi_nn_types.h" +#ifdef __ANDROID__ +#if ANDROID_SDK_VERSION >= 30 +static const char* ENV_LOG_LEVEL = "vendor.VSI_NN_LOG_LEVEL"; +#else +static const char* ENV_LOG_LEVEL = "VSI_NN_LOG_LEVEL"; +#endif +#else +static const char* ENV_LOG_LEVEL = "VSI_NN_LOG_LEVEL"; +#endif + +int get_env_as_int(const char* env, int default_value) { + + int value = default_value; + #ifdef __ANDROID__ + { + char value_str[100]; + int status = __system_property_get(env, value_str); + if (status) { + value = atoi(value_str); + } + } + #else + { + char* env_s = getenv(env); + if (env_s) { + value = atoi(env_s); + } + } + #endif + + return value; +} + static vsi_bool _check_log_level ( vsi_nn_log_level_e level ) { - char *env_level_s; static vsi_nn_log_level_e env_level = VSI_NN_LOG_UNINIT; if(env_level == VSI_NN_LOG_UNINIT) { - env_level_s = getenv("VSI_NN_LOG_LEVEL"); - if(env_level_s) - { - env_level = (vsi_nn_log_level_e)atoi(env_level_s); - } - else - { - env_level = VSI_NN_LOG_WARN; - } + env_level = (vsi_nn_log_level_e)get_env_as_int(ENV_LOG_LEVEL, VSI_NN_LOG_WARN); } if(env_level >= level) diff --git a/src/tim/vx/internal/src/vsi_nn_tensor.c b/src/tim/vx/internal/src/vsi_nn_tensor.c index d9d0158..0af8be5 100644 --- a/src/tim/vx/internal/src/vsi_nn_tensor.c +++ b/src/tim/vx/internal/src/vsi_nn_tensor.c @@ -844,7 +844,7 @@ float * vsi_nn_ConvertTensorToFloat32Data if( !tensor->attr.is_created_from_handle ) { - if(tensor_data)free(tensor_data); + vsi_nn_safe_free( tensor_data ); } return data; } /* vsi_nn_ConvertTensorToFloat32Data() */ @@ -1095,7 +1095,7 @@ void vsi_nn_SaveTensorToTextByFp32 } fwrite( buf, count, 1, fp ); fclose( fp ); - free( data ); + vsi_nn_safe_free( data ); } /* vsi_nn_SaveTensorToTextByFp32() */ void vsi_nn_SaveTensorToText @@ -1124,7 +1124,7 @@ void vsi_nn_SaveTensorToText sz = vsi_nn_GetElementNum( tensor ); vsi_nn_SaveDataToText( filename, data, sz, tensor->attr.dtype.vx_type, seperator ); - free( data ); + vsi_nn_safe_free( data ); } /* vsi_nn_SaveTensorToText() */ void vsi_nn_SaveDataToText @@ -1219,7 +1219,7 @@ void vsi_nn_SaveTensorToBinary } fwrite( data, sz, 1, fp ); fclose( fp ); - free( data ); + vsi_nn_safe_free( data ); } /* vsi_nn_SaveTensorToBinary() */ vsi_nn_tensor_t * vsi_nn_CreateTensorFromData @@ -1539,7 +1539,7 @@ void vsi_nn_TransposeTensor VSILOGE( "Copy transpose data fail with code %#x.", status ); } - free( buf ); + vsi_nn_safe_free( buf ); free( dst ); } /* vsi_nn_TransposeTensor() */ @@ -1588,7 +1588,7 @@ void vsi_nn_PermuteTensor if( perm[i] >= dim_num ) { VSILOGW( "Incorrect perm %d", perm[i] ); - if( buf ) { free(buf); buf = NULL; } + vsi_nn_safe_free( buf ); if( dst ) { free(dst); dst = NULL; } return; } @@ 
-1603,7 +1603,7 @@ void vsi_nn_PermuteTensor VSILOGE( "Copy permute data fail with code %#x.", status ); } - if( buf ) { free(buf); buf = NULL; } + vsi_nn_safe_free( buf ); if( dst ) { free(dst); dst = NULL; } } /* vsi_nn_PermuteTensor() */ @@ -2241,7 +2241,7 @@ void vsi_nn_reshuffle_weight_data } vsi_nn_CopyDataToTensor( graph, weights, weight_data ); vsi_nn_Free( buffer ); - vsi_nn_Free( weight_data ); + vsi_nn_safe_free( weight_data ); } vsi_nn_tensor_t* vsi_nn_ConcatTensor_impl