Update internal to REL/v1.1.30

Commit: 6ccb425e
Signed-off-by: Jiang Bo <bo.jiang@verisilicon.com>
Jiang Bo 2021-02-26 14:01:35 +08:00 committed by Kainan Cha
parent 62898a4419
commit def53f4b5c
77 changed files with 8294 additions and 225 deletions

.gitignore (vendored, new file, 335 lines added)
View File

@@ -0,0 +1,335 @@
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
# User-specific files
*.suo
*.user
*.userosscache
*.sln.docstates
*-[Dd]ebug/
*-[Dd]ebugPublic/
*-[Rr]elease/
*-[Rr]eleases/
# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs
# Build results
*.o
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
NNApi0.3/
NNApi0.4/
OpenVX1.2/
bazel-bin
bazel-genfiles
bazel-out
bazel-ovxlib
bazel-testlogs
# Visual Studio 2015/2017 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/
# VS code
.vscode
# Visual Studio 2017 auto generated files
Generated\ Files/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
# NUNIT
*.VisualState.xml
TestResult.xml
# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c
# Benchmark Results
BenchmarkDotNet.Artifacts/
# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/
**/Properties/launchSettings.json
# StyleCop
StyleCopReport.xml
# Files built by Visual Studio
*_i.c
*_p.c
*_i.h
*.ilk
*.meta
*.obj
*.pch
*.pdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc
# Chutzpah Test files
_Chutzpah*
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb
# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap
# Visual Studio Trace Files
*.e2e
# TFS 2012 Local Workspace
$tf/
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user
# JustCode is a .NET coding add-in
.JustCode
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# AxoCover is a Code Coverage Tool
.axoCover/*
!.axoCover/settings.json
# Visual Studio code coverage results
*.coverage
*.coveragexml
# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*
# MightyMoose
*.mm.*
AutoTest.Net/
# Web workbench (sass)
.sass-cache/
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# Note: Comment the next line if you want to checkin your web deploy settings,
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj
# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/
# NuGet Packages
*.nupkg
# The packages folder can be ignored because of Package Restore
**/[Pp]ackages/*
# except build/, which is used as an MSBuild target.
!**/[Pp]ackages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/[Pp]ackages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets
# Microsoft Azure Build Output
csx/
*.build.csdef
# Microsoft Azure Emulator
ecf/
rcf/
# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
*.appx
# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!*.[Cc]ache/
# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs
# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
# SQL Server files
*.mdf
*.ldf
*.ndf
# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
# Microsoft Fakes
FakesAssemblies/
# GhostDoc plugin setting file
*.GhostDoc.xml
# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/
# TypeScript v1 declaration files
typings/
# Visual Studio 6 build log
*.plg
# Visual Studio 6 workspace options file
*.opt
# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw
# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions
# Paket dependency manager
.paket/paket.exe
paket-files/
# FAKE - F# Make
.fake/
# JetBrains Rider
.idea/
*.sln.iml
# CodeRush
.cr/
# Python Tools for Visual Studio (PTVS)
__pycache__/
*.pyc
# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config
# Tabs Studio
*.tss
# Telerik's JustMock configuration file
*.jmconfig
# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs
# OpenCover UI analysis results
OpenCover/
# Azure Stream Analytics local run output
ASALocalRun/
# IDE
.settings/

View File

@@ -144,3 +144,5 @@ DEF_OP(PRE_PROCESS_YUV444)
 DEF_OP(PRE_PROCESS_NV12)
 DEF_OP(SCATTER_ND)
 DEF_OP(DECONVOLUTION1D)
+DEF_OP(INTERP)
+DEF_OP(RESIZE_1D)

View File

@@ -14,3 +14,5 @@ DEF_OP(RESIZE_NEAREST_INTERNAL)
 DEF_OP(DEPTH2SPACE_INTERNAL)
 DEF_OP(GRUCELL_ACTIVATION_INTERNAL)
 DEF_OP(GRUCELL_ACTIVATION_INTERNAL_SMA)
+DEF_OP(RESIZE_1D_BILINEAR_INTERNAL)
+DEF_OP(RESIZE_1D_NEAREST_INTERNAL)

View File

@@ -44,6 +44,7 @@ typedef struct _vsi_nn_elu_param
 {
     /* elu layer local data structure */
     vsi_nn_elu_lcl_data local;
+    float alpha;
 } vsi_nn_elu_param;
 #ifdef __cplusplus

View File

@@ -0,0 +1,44 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_INTERP_H
#define _VSI_NN_OP_INTERP_H
#include "vsi_nn_types.h"
typedef struct _vsi_nn_interp_param
{
struct _interp_local_data_t* local;
    int32_t height;        // height of output
    int32_t width;         // width of output
    int32_t zoom_factor;   // zoom factor
    int32_t shrink_factor; // shrink factor
    int32_t pad_beg;       // padding at begin of input
    int32_t pad_end;       // padding at end of input
} vsi_nn_interp_param;
#endif
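The zoom/shrink/pad fields follow the shape conventions of Caffe's Interp layer. A minimal sketch of the output-width derivation they imply, assuming this op mirrors Caffe's behavior (this header alone does not pin that down, and explicit height/width, when set, would override the derived size):

    /* hedged sketch, not taken from this commit */
    int in_eff = in_w + pad_beg + pad_end;               /* padded input extent */
    int out_w  = in_eff;
    if (shrink_factor > 1)
        out_w = (in_eff - 1) / shrink_factor + 1;        /* coarsen first */
    if (zoom_factor > 1)
        out_w = out_w + (out_w - 1) * (zoom_factor - 1); /* then zoom */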

View File

@@ -0,0 +1,44 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_RESIZE_1D_H
#define _VSI_NN_OP_RESIZE_1D_H
#include "vsi_nn_types.h"
typedef struct _vsi_nn_resize_1d_param
{
struct _resize_1d_local_data_t* local;
vsi_enum type;
float factor;
int32_t size[2];
vsi_bool align_corners;
vsi_bool half_pixel_centers;
} vsi_nn_resize_1d_param;
_compiler_assert(offsetof(vsi_nn_resize_1d_param, local) == 0, \
vsi_nn_resize_1d_h );
#endif

View File

@@ -0,0 +1,42 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_RESIZE_1D_BILINEAR_INTERNAL_H
#define _VSI_NN_OP_RESIZE_1D_BILINEAR_INTERNAL_H
#include "vsi_nn_types.h"
typedef struct _vsi_nn_resize_1d_bilinear_internal_param
{
struct _resize_1d_bilinear_internal_local_data_t* local;
vsi_bool align_corners;
vsi_bool half_pixel_centers;
float factor;
} vsi_nn_resize_1d_bilinear_internal_param;
_compiler_assert(offsetof(vsi_nn_resize_1d_bilinear_internal_param, local) == 0, \
vsi_nn_resize_1d_bilinear_internal_h );
#endif

View File

@@ -0,0 +1,42 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_RESIZE_1D_NEAREST_INTERNAL_H
#define _VSI_NN_OP_RESIZE_1D_NEAREST_INTERNAL_H
#include "vsi_nn_types.h"
typedef struct _vsi_nn_resize_1d_nearest_internal_param
{
struct _resize_1d_nearest_internal_local_data_t* local;
vsi_bool align_corners;
vsi_bool half_pixel_centers;
float factor;
} vsi_nn_resize_1d_nearest_internal_param;
_compiler_assert(offsetof(vsi_nn_resize_1d_nearest_internal_param, local) == 0, \
vsi_nn_resize_1d_nearest_internal_h );
#endif

View File

@@ -195,13 +195,6 @@ OVXLIB_API vsi_bool vsi_nn_CheckFilePath
     const char *path
     );
-OVXLIB_API void vsi_nn_GetFP32MultiAndPostShift
-    (
-    vx_float32 mult,
-    vx_uint16 *M0,
-    vx_int8 *N
-    );
 /**
  * Malloc aligned buffer
  * Malloc address and size aligned buffer.

View File

@@ -32,6 +32,13 @@
 extern "C" {
 #endif
+vx_tensor vsi_nn_CreateRawTensorFromData
+    (
+    vsi_nn_graph_t * graph,
+    uint8_t * data,
+    vsi_nn_tensor_attr_t * attr
+    );
 vsi_status vsi_nn_OptimizeGraph
     (
     vsi_nn_graph_t* graph,
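A minimal caller-side sketch of the newly exposed helper (attribute setup elided; `raw_data` is a hypothetical name for a buffer whose layout matches `attr`):

    vsi_nn_tensor_attr_t attr;   /* filled in by the caller */
    vx_tensor t = vsi_nn_CreateRawTensorFromData( graph, raw_data, &attr );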

View File

@@ -158,6 +158,10 @@
 #include "ops/vsi_nn_op_squeeze.h"
 #include "ops/vsi_nn_op_expand_broadcast.h"
 #include "ops/vsi_nn_op_deconvolution1d.h"
+#include "ops/vsi_nn_op_interp.h"
+#include "ops/vsi_nn_op_resize_1d.h"
+#include "ops/vsi_nn_op_resize_1d_bilinear_internal.h"
+#include "ops/vsi_nn_op_resize_1d_nearest_internal.h"
 /* custom node head define */
 #include "custom/vsi_nn_custom_node_type.h"
@@ -302,6 +306,10 @@ typedef union _vsi_nn_nn_param
     vsi_nn_squeeze_param squeeze;
     vsi_nn_expand_broadcast_param expand_broadcast;
     vsi_nn_deconvolution1d_param deconvolution1d;
+    vsi_nn_interp_param interp;
+    vsi_nn_resize_1d_param resize_1d;
+    vsi_nn_resize_1d_bilinear_internal_param resize_1d_bilinear_internal;
+    vsi_nn_resize_1d_nearest_internal_param resize_1d_nearest_internal;
     uint8_t client_param[128];
 /* custom node data struct define */

View File

@@ -33,7 +33,7 @@ extern "C"{
 #define VSI_NN_VERSION_MAJOR 1
 #define VSI_NN_VERSION_MINOR 1
-#define VSI_NN_VERSION_PATCH 28
+#define VSI_NN_VERSION_PATCH 30
 #define VSI_NN_VERSION \
     (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
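For reference, with this bump the packed value evaluates to 1 * 10000 + 1 * 100 + 30 = 10130, so version checks of the form `VSI_NN_VERSION >= 10130` pick up this release.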

View File

@@ -117,6 +117,7 @@ LOCAL_C_INCLUDES += \
 LOCAL_CFLAGS := \
     -DLINUX \
     -D'OVXLIB_API=__attribute__((visibility("default")))' \
+    -DANDROID_SDK_VERSION=$(PLATFORM_SDK_VERSION) \
     -Wno-sign-compare \
     -Wno-implicit-function-declaration \
     -Wno-sometimes-uninitialized \

View File

@@ -168,12 +168,14 @@ static vx_param_description_t kernel_param_def[] =
     {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
     {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
     {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
 };
 #define SCALAR_INPUT_SCALE (2)
 #define SCALAR_INPUT_TAIL (3)
 #define SCALAR_OUTPUT_SCALE (4)
 #define SCALAR_OUTPUT_ZP (5)
+#define SCALAR_ALPHA (6)
 #define _CL_PARAM_NUM _cnt_of_array(kernel_param_def)
 /*
@@ -293,6 +295,7 @@ static vsi_nn_kernel_node_t _setup
     float inputTail = (float)inputs[0]->attr.dtype.zero_point * inputScale;
     float outputScale = outputs[0]->attr.dtype.scale;
     float outputZP = (float)outputs[0]->attr.dtype.zero_point + 0.5f;
+    float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" );
     ret = vsi_nn_kernel_optimize_element_shape(
             (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num,
@@ -331,6 +334,8 @@ static vsi_nn_kernel_node_t _setup
             graph, F32, &outputScale );
         node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create(
             graph, F32, &outputZP );
+        node_params[SCALAR_ALPHA] = vsi_nn_kernel_scalar_create(
+            graph, F32, &alpha );
         /* Pass parameters to node. */
         status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM );
@@ -369,6 +374,11 @@ OnError:
         vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] );
     }
+    if (node_params[SCALAR_ALPHA])
+    {
+        vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALPHA] );
+    }
     return node;
 } /* _setup() */
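A hedged sketch of the producer side: the ELU op layer is expected to publish `alpha` into the kernel param dict before this `_setup()` runs. The `vsi_nn_kernel_param_*` names below are ovxlib's kernel-param API; `alpha_value` is a hypothetical placeholder, and the real call site is outside this diff:

    vsi_nn_kernel_param_t * param = vsi_nn_kernel_param_create();
    vsi_nn_kernel_param_add_float32( param, "alpha", alpha_value ); /* alpha_value: caller-supplied */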

View File

@@ -356,7 +356,8 @@ static vsi_nn_kernel_node_t _setup
     int32_t out_shape[VSI_NN_MAX_DIM_NUM] = {0};
     int32_t out_rs_flg = 0;
     int32_t axis_num = 0;
-    int32_t* axis = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "axis", (size_t*)&axis_num);
+    size_t axis_num_temp = 0;
+    int32_t* axis = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "axis", &axis_num_temp);
     int32_t keep_dim = vsi_nn_kernel_param_get_int32( params, "keep_dim" );
     int32_t first_axis = axis[0];
     int32_t i = 0;
@@ -369,6 +370,8 @@ static vsi_nn_kernel_node_t _setup
     float input_scale = inputs[0]->attr.dtype.scale;
     float dim_ratio = (float)1.0 / (float)(width * height);
+    axis_num = (int32_t)axis_num_temp;
     if(inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP)
     {
         if (inputs[0]->attr.dtype.fl > 0)

View File

@@ -0,0 +1,305 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
#define _RESIZE_1D_BILINEAR_KERNEL_SOURCE() "resize_1d_bilinear"
#define STR(a) #a
// Add kernel hashtable here
#define RESIZE_1D_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
(( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) )
#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_1D_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \
CVIVANTE_NAMESPACE("cl.resize_1d_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
_RESIZE_1D_BILINEAR_KERNEL_SOURCE() }
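/* Note: the key keeps the two dtypes in disjoint bit fields
 * ((in << 20) | (out << 8)), so every (input, output) pair registered
 * below maps to a distinct key for the linear search in _query_kernel(). */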
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _resize_1d_bilinear_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP( F32, F32),
PACK_KERNEL_MAP( U8, U8),
};
/*
* Kernel params
*/
static vx_param_description_t _resize_1d_bilinear_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _RESIZE_1D_BILINEAR_PARAM_NUM _cnt_of_array( _resize_1d_bilinear_kernel_param_def )
#define SCALAR_SCALE_X (2)
#define SCALAR_HALF_PIXEL (3)
#define SCALAR_INPUT_SCALE (4)
#define SCALAR_INPUT_TAIL (5)
#define SCALAR_OUTPUT_SCALE (6)
#define SCALAR_OUTPUT_TAIL (7)
#define RESIZE_1D_BILINEAR_NUM 4
#define RESIZE_1D_BILINEAR_QUANT_NUM _cnt_of_array( _resize_1d_bilinear_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_resize_1d_bilinear_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_int_array_t * out_shape = NULL;
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
out_shape = output_attr->shape;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.dim = (out_shape->size < 3 || 1 == out_shape->data[2]) ? 2 : 3;
gpu_param.global_size[0] = gpu_align_p2(
(out_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (
(out_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1]);
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
SAFE_FREE_TENSOR_ATTR(output_attr);
return status;
} /* _resize_1d_bilinear_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
vsi_bool *is_use_u8_kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype = F16;
vsi_nn_kernel_dtype_e out_dtype = F16;
const _kernel_map_type * kernel_map = _resize_1d_bilinear_kernel_map;
size_t kernel_map_size = _cnt_of_array( _resize_1d_bilinear_kernel_map );
vx_param_description_t * param_def = _resize_1d_bilinear_kernel_param_def;
size_t param_def_size = _cnt_of_array( _resize_1d_bilinear_kernel_param_def );
vx_kernel_initialize_f initializer = _resize_1d_bilinear_initializer;
uint32_t key = 0;
uint32_t i = 0;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (F16 == in_dtype)
{
in_dtype = F32;
}
if (F16 == out_dtype)
{
out_dtype = F32;
}
if ((U8 == in_dtype) || (U8 == out_dtype))
{
param_def_size = RESIZE_1D_BILINEAR_QUANT_NUM;
*is_use_u8_kernel = TRUE;
}
else
{
param_def_size = RESIZE_1D_BILINEAR_NUM;
*is_use_u8_kernel = FALSE;
}
key = RESIZE_1D_BILINEAR_HASH_KEY( in_dtype, out_dtype );
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = (uint32_t)param_def_size;
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[RESIZE_1D_BILINEAR_QUANT_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" );
int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" );
int32_t in_width = inputs[0]->attr.size[0];
int32_t out_width = outputs[0]->attr.size[0];
float input_zp = (float)inputs[0]->attr.dtype.zero_point;
float input_scale = inputs[0]->attr.dtype.scale;
float input_tail = -(input_zp * input_scale);
float output_zp = (float)outputs[0]->attr.dtype.zero_point;
float output_scale = (0 == outputs[0]->attr.dtype.scale) ? 1.0f : 1.0f / outputs[0]->attr.dtype.scale;
float half_pixel_value = 0.0f;
float scale_factor_x = 0.0f;
vsi_bool is_use_u8_kernel = FALSE;
if (align_corners && out_width > 1)
{
scale_factor_x = ((vx_float32)(in_width - 1) * 1.0f) / (vx_float32)(out_width - 1);
}
else
{
scale_factor_x = ((vx_float32)in_width * 1.0f) / (vx_float32)out_width;
}
if (half_pixel_centers)
{
half_pixel_value = 0.5f;
}
else
{
half_pixel_value = 0.0f;
}
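    /* Worked example: in_width = 4, out_width = 8 gives scale_factor_x = 4/8 = 0.5
     * (or 3/7 with align_corners). half_pixel_value = 0.5 shifts sampling to
     * pixel centers, matching the CPU reference added in this commit:
     * input_w = (w + 0.5) * scale - 0.5. */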
status = _query_kernel( kernel, inputs, outputs, &is_use_u8_kernel );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
size_t node_params_num = RESIZE_1D_BILINEAR_NUM;
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, RESIZE_1D_BILINEAR_QUANT_NUM,
inputs, input_num, outputs, output_num );
node_params[SCALAR_SCALE_X] = vsi_nn_kernel_scalar_create( graph, F32, &scale_factor_x );
node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, F32, &half_pixel_value );
if (is_use_u8_kernel)
{
node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale );
node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input_tail );
node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale );
node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &output_zp );
node_params_num = RESIZE_1D_BILINEAR_QUANT_NUM;
}
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
VSI_ASSERT( status == VSI_SUCCESS );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_X] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] );
if (is_use_u8_kernel)
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_TAIL] );
}
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( resize_1d_bilinear, _setup )

View File

@@ -0,0 +1,312 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
typedef enum
{
INTERNAL_KERNEL_RESIZE_1D_NEAREST,
} _internal_kernel_e;
#define _RESIZE_1D_NEAREST_KERNEL_SOURCE "resize_1d_nearest"
#define STR(a) #a
// Add kernel hashtable here
#define RESIZE_1D_NEAREST_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
(( IN_DTYPE << 8 ) | ( OUT_DTYPE ))
#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_1D_NEAREST_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \
CVIVANTE_NAMESPACE("cl.resize_1d_nearest_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
_RESIZE_1D_NEAREST_KERNEL_SOURCE }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _resize_1d_nearest_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP( F32, F32),
PACK_KERNEL_MAP( U8, U8),
};
/*
* Kernel params
*/
static vx_param_description_t _resize_1d_nearest_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _RESIZE_1D_NEAREST_PARAM_NUM 5
#define _RESIZE_1D_NEAREST_QUANT_NUM _cnt_of_array( _resize_1d_nearest_kernel_param_def )
#define SCALAR_SCALE_X (2)
#define SCALAR_HALF_PIXEL (3)
#define SCALAR_ROUND_VALUE (4)
#define SCALAR_SCALE_VALUE (5)
#define SCALAR_TAIL_VALUE (6)
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_resize_1d_nearest_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_int_array_t * out_shape = NULL;
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
out_shape = output_attr->shape;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.dim = (out_shape->size < 3 || 1 == out_shape->data[2]) ? 2 : 3;
gpu_param.global_size[0] = gpu_align_p2(
(out_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (
(out_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1]);
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
SAFE_FREE_TENSOR_ATTR(output_attr);
return status;
} /* _resize_1d_nearest_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
vsi_bool *is_use_u8_kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype = F16;
vsi_nn_kernel_dtype_e out_dtype = F16;
const _kernel_map_type * kernel_map = _resize_1d_nearest_kernel_map;
size_t kernel_map_size = _cnt_of_array( _resize_1d_nearest_kernel_map );
vx_param_description_t * param_def = _resize_1d_nearest_kernel_param_def;
size_t param_def_size = _cnt_of_array( _resize_1d_nearest_kernel_param_def );
vx_kernel_initialize_f initializer = _resize_1d_nearest_initializer;
uint32_t key = 0;
uint32_t i = 0;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (F16 == in_dtype)
{
in_dtype = F32;
}
if (F16 == out_dtype)
{
out_dtype = F32;
}
if ((U8 == in_dtype) || (U8 == out_dtype))
{
param_def_size = _RESIZE_1D_NEAREST_QUANT_NUM;
*is_use_u8_kernel = TRUE;
}
else
{
param_def_size = _RESIZE_1D_NEAREST_PARAM_NUM;
*is_use_u8_kernel = FALSE;
}
key = RESIZE_1D_NEAREST_HASH_KEY( in_dtype, out_dtype );
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = (uint32_t)param_def_size;
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_RESIZE_1D_NEAREST_QUANT_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" );
int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" );
int32_t in_width = inputs[0]->attr.size[0];
int32_t out_width = outputs[0]->attr.size[0];
float input_zp = (float)inputs[0]->attr.dtype.zero_point;
float input_scale = inputs[0]->attr.dtype.scale;
float output_scale = (0 == outputs[0]->attr.dtype.scale) ? \
input_scale : input_scale / outputs[0]->attr.dtype.scale;
float output_tail = (float)outputs[0]->attr.dtype.zero_point - input_zp * output_scale;
float half_pixel_value = 0.0f;
float round_value = 0.0f;
float scale_factor_x = 0.0f;
vsi_bool is_use_u8_kernel = FALSE;
if (align_corners && out_width > 1)
{
scale_factor_x = ((vx_float32)(in_width - 1) * 1.0f) / (vx_float32)(out_width - 1);
}
else
{
scale_factor_x = ((vx_float32)in_width * 1.0f) / (vx_float32)out_width;
}
if (align_corners)
{
round_value = 0.5f;
}
else
{
round_value = 0.0f;
}
if (half_pixel_centers)
{
half_pixel_value = 0.5f;
}
else
{
half_pixel_value = 0.0f;
}
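    /* round_value folds the two index modes into one kernel expression,
     * in_x = floor(input_w + round_value): with align_corners the 0.5 makes
     * it round-to-nearest, otherwise it stays a plain floor (see the CPU
     * reference implementation of resize_1d_nearest in this commit). */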
status = _query_kernel( kernel, inputs, outputs, &is_use_u8_kernel );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
size_t node_params_num = _RESIZE_1D_NEAREST_PARAM_NUM;
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _RESIZE_1D_NEAREST_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[SCALAR_SCALE_X] = vsi_nn_kernel_scalar_create( graph, F32, &scale_factor_x );
node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, F32, &half_pixel_value );
node_params[SCALAR_ROUND_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &round_value );
if (is_use_u8_kernel)
{
node_params[SCALAR_SCALE_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale );
node_params[SCALAR_TAIL_VALUE] = vsi_nn_kernel_scalar_create(graph, F32, &output_tail );
node_params_num = _RESIZE_1D_NEAREST_QUANT_NUM;
}
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_X] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_ROUND_VALUE] );
if (is_use_u8_kernel)
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_VALUE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_TAIL_VALUE] );
}
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( resize_1d_nearest, _setup )

View File

@@ -49,7 +49,7 @@ typedef enum
 } unary_type_e;
-#define _CPU_ARG_NUM (1)
+#define _CPU_ARG_NUM (2)
 #define _CPU_INPUT_NUM (1)
 #define _CPU_OUTPUT_NUM (1)
 #define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
@@ -71,9 +71,9 @@ static float log_eval(float data)
     return logf(data);
 }
-static float elu_eval(float data)
+static float elu_eval(float data, float alpha)
 {
-    return data >=0 ? data : expf(data) - 1;
+    return data >=0 ? data : expf(data) * alpha - alpha;
 }
 static float neg_eval(float data)
@@ -114,6 +114,7 @@ DEF_KERNEL_EXECUTOR(_eltwise_unary_exec)
     size_t out_elements = 0;
     vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
     int32_t i;
+    float alpha = 0;
     int32_t unary_type = 0;
     tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
@@ -126,6 +127,8 @@ DEF_KERNEL_EXECUTOR(_eltwise_unary_exec)
     status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &unary_type);
     CHECK_STATUS_FAIL_GOTO(status, final );
+    status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[3], &alpha);
+    CHECK_STATUS_FAIL_GOTO(status, final );
     buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
     CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final );
@@ -151,7 +154,7 @@ DEF_KERNEL_EXECUTOR(_eltwise_unary_exec)
                 data = log_eval(data);
                 break;
             case UNARY_ELU:
-                data = elu_eval(data);
+                data = elu_eval(data, alpha);
                 break;
             case UNARY_NEG:
                 data = neg_eval(data);
@@ -193,9 +196,11 @@ static vx_param_description_t kernel_param_def[] =
     {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
     {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
     {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
 };
 #define INPUT_FUNC_TYPE (2)
+#define INPUT_SCALAR_ALPHA (3)
 static const vx_kernel_description_t _kernel_info =
 {
@@ -237,6 +242,7 @@ static vsi_nn_kernel_node_t _setup
     vsi_status status = VSI_SUCCESS;
     vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL};
     vsi_nn_kernel_node_t node = NULL;
+    float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" );
     status = _query_kernel( inputs, outputs, kernel );
     if( VSI_SUCCESS == status)
@@ -249,10 +255,13 @@ static vsi_nn_kernel_node_t _setup
             inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM );
         backend_params[INPUT_FUNC_TYPE] = vsi_nn_kernel_scalar_create(
             graph, I32, &unary_type );
+        backend_params[INPUT_SCALAR_ALPHA] = vsi_nn_kernel_scalar_create(
+            graph, F32, &alpha );
         /* Pass parameters to node. */
         status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );
         vsi_nn_kernel_scalar_release( &backend_params[INPUT_FUNC_TYPE] );
+        vsi_nn_kernel_scalar_release( &backend_params[INPUT_SCALAR_ALPHA] );
     }
     else
     {
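The rewritten `elu_eval` is the parameterized ELU, f(x) = x for x >= 0 and alpha * (e^x - 1) otherwise; `expf(data) * alpha - alpha` is the same expression with alpha distributed, and alpha = 1 reproduces the previously hard-coded behavior.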

View File

@@ -258,11 +258,14 @@ static vsi_nn_kernel_node_t _setup
     vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL};
     vsi_nn_kernel_node_t node = NULL;
     int32_t axis_num = 0;
-    int32_t* axis = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "axis", (size_t*)&axis_num);
+    size_t axis_num_temp = 0;
+    int32_t* axis = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "axis", &axis_num_temp);
     vsi_bool is_continue_axis = TRUE;
     uint32_t mask = 0;
     int32_t i = 0;
+    axis_num = (int32_t)axis_num_temp;
     for ( i = 1; i < axis_num; i++)
     {
         if ( axis[i] != (axis[i - 1] + 1) && axis[0] == 0)

View File

@@ -0,0 +1,271 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.resize_1d_bilinear")
/*
* Kernel params
*/
static vx_param_description_t _resize_1d_bilinear_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _RESIZE_1D_BILINEAR_PARAM_NUM _cnt_of_array( _resize_1d_bilinear_kernel_param_def )
#define SCALAR_ALIGN_CORNERS (2)
#define SCALAR_HALF_PIXEL (3)
/*
* Kernel function
*/
DEF_KERNEL_EXECUTOR(_compute)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
size_t out_elements[_OUTPUT_NUM] = {0};
size_t out_bytes[_OUTPUT_NUM] = {0};
uint32_t i = 0;
int32_t align_corners = 0;
int32_t half_pixel_centers = 0;
float width_scale = 1.0f;
uint32_t input_width = 0, output_width = 0;
uint32_t w = 0, out = 0;
uint32_t output_dims = 0;
float data00 = .0f, data01 = .0f, interpolation = .0f;
uint32_t index = 0;
uint32_t outer = 0;
/* prepare data */
for (i = 0; i < _INPUT_NUM; i ++)
{
input[i] = (vsi_nn_kernel_tensor_t)param[i];
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
}
for (i = 0; i < _OUTPUT_NUM; i ++)
{
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] );
out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
out_bytes[i] = out_elements[i] * sizeof(float);
f32_out_buffer[i] = (float *)malloc( out_bytes[i] );
CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
memset( f32_out_buffer[i], 0, out_bytes[i] );
}
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_ALIGN_CORNERS], &(align_corners));
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_HALF_PIXEL], &(half_pixel_centers));
input_width = in_attr[0]->shape->data[0];
output_width = out_attr[0]->shape->data[0];
output_dims = (uint32_t)out_attr[0]->shape->size;
if (align_corners && output_width > 1)
{
width_scale = ((vx_float32)(input_width - 1) * 1.0f) / (vx_float32)(output_width - 1);
}
else
{
width_scale = ((vx_float32)input_width * 1.0f) / (vx_float32)output_width;
}
outer = 1;
for (i = 1; i < output_dims; i++)
{
outer = outer * out_attr[0]->shape->data[i];
}
for (out = 0; out < outer; out++)
{
vx_int32 input_base = out * input_width;
vx_int32 output_base = out * output_width;
for (w = 0; w < output_width; w ++)
{
vx_float32 input_w;
vx_int32 w0;
vx_int32 w1;
if (half_pixel_centers)
{
input_w = ((vx_float32)w + 0.5f) * width_scale - 0.5f;
}
else
{
input_w = w * width_scale;
}
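            /* w0 truncates input_w to the left sample; w1 is its right
             * neighbor clamped into range. The fractional part input_w - w0
             * weights the blend: out = (1 - frac) * in[w0] + frac * in[w1]. */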
w0 = (vx_int32)input_w;
w1 = input_w < 0 ? 0 : vsi_nn_min(w0 + 1, (vx_int32)(input_width - 1));
index = input_base + w0;
data00 = f32_in_buffer[0][index];
index = input_base + w1;
data01 = f32_in_buffer[0][index];
interpolation = data00 * (1 - (input_w - w0)) +
data01 * (input_w - w0);
index = output_base + w;
f32_out_buffer[0][index] = interpolation;
}
}
/* save data */
for (i = 0; i < _OUTPUT_NUM; i++)
{
status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i],
f32_out_buffer[i], out_elements[i] );
CHECK_STATUS_FAIL_GOTO( status, final );
}
final:
for (i = 0; i < _INPUT_NUM; i++)
{
if (f32_in_buffer[i])
{
free(f32_in_buffer[i]);
f32_in_buffer[i] = NULL;
}
if (in_attr[i])
{
vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
}
}
for (i = 0; i < _OUTPUT_NUM; i++)
{
if (f32_out_buffer[i])
{
free(f32_out_buffer[i]);
f32_out_buffer[i] = NULL;
}
if (out_attr[i])
{
vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
}
}
return status;
} /* _compute() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
/* Add extra params */
)
{
vsi_status status = VSI_FAILURE;
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _compute;
kernel->info.parameters = _resize_1d_bilinear_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _resize_1d_bilinear_kernel_param_def );
status = VSI_SUCCESS;
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_RESIZE_1D_BILINEAR_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" );
int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" );
status = _query_kernel( kernel, inputs, outputs );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _RESIZE_1D_BILINEAR_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[SCALAR_ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners );
node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, I32, &half_pixel_centers );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _RESIZE_1D_BILINEAR_PARAM_NUM );
VSI_ASSERT( status == VSI_SUCCESS );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALIGN_CORNERS] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( resize_1d_bilinear, _setup )

View File

@@ -0,0 +1,271 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.resize_1d_nearest")
/*
* Kernel params
*/
static vx_param_description_t _resize_1d_nearest_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _RESIZE_1D_NEAREST_PARAM_NUM _cnt_of_array( _resize_1d_nearest_kernel_param_def )
#define SCALAR_ALIGN_CORNERS (2)
#define SCALAR_HALF_PIXEL (3)
/*
* Kernel function
*/
DEF_KERNEL_EXECUTOR(_compute)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
size_t out_elements[_OUTPUT_NUM] = {0};
size_t out_bytes[_OUTPUT_NUM] = {0};
uint32_t i = 0;
int32_t align_corners = 0;
int32_t half_pixel_centers = 0;
float width_scale = 1.0f;
uint32_t input_width = 0, output_width = 0;
uint32_t w = 0, out = 0;
uint32_t output_dims = 0;
uint32_t outer = 0;
/* prepare data */
for (i = 0; i < _INPUT_NUM; i ++)
{
input[i] = (vsi_nn_kernel_tensor_t)param[i];
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
}
for (i = 0; i < _OUTPUT_NUM; i ++)
{
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] );
out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
out_bytes[i] = out_elements[i] * sizeof(float);
f32_out_buffer[i] = (float *)malloc( out_bytes[i] );
CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
memset( f32_out_buffer[i], 0, out_bytes[i] );
}
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_ALIGN_CORNERS], &(align_corners));
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_HALF_PIXEL], &(half_pixel_centers));
input_width = in_attr[0]->shape->data[0];
output_width = out_attr[0]->shape->data[0];
output_dims = (uint32_t)out_attr[0]->shape->size;
if (align_corners && output_width > 1)
{
width_scale = ((vx_float32)(input_width - 1) * 1.0f) / (vx_float32)(output_width - 1);
}
else
{
width_scale = ((vx_float32)input_width * 1.0f) / (vx_float32)output_width;
}
outer = 1;
for (i = 1; i < output_dims; i++)
{
outer = outer * out_attr[0]->shape->data[i];
}
for (out = 0; out < outer; out++)
{
vx_int32 input_base = out * input_width;
vx_int32 output_base = out * output_width;
for (w = 0; w < output_width; w ++)
{
float input_w;
uint32_t in_x;
int32_t in_index;
int32_t out_index;
if (half_pixel_centers)
{
input_w = ((float)w + 0.5f) * width_scale;
}
else
{
input_w = w * width_scale;
}
if (align_corners)
{
in_x = vsi_nn_min((uint32_t)simple_round(input_w), input_width - 1);
}
else
{
in_x = vsi_nn_min((uint32_t)floorf(input_w), input_width - 1);
}
in_index = in_x + input_base;
out_index = w + output_base;
f32_out_buffer[0][out_index] = f32_in_buffer[0][in_index];
}
}
/* save data */
for (i = 0; i < _OUTPUT_NUM; i++)
{
status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i],
f32_out_buffer[i], out_elements[i] );
CHECK_STATUS_FAIL_GOTO( status, final );
}
final:
for (i = 0; i < _INPUT_NUM; i++)
{
if (f32_in_buffer[i])
{
free(f32_in_buffer[i]);
f32_in_buffer[i] = NULL;
}
if (in_attr[i])
{
vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
}
}
for (i = 0; i < _OUTPUT_NUM; i++)
{
if (f32_out_buffer[i])
{
free(f32_out_buffer[i]);
f32_out_buffer[i] = NULL;
}
if (out_attr[i])
{
vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
}
}
return status;
} /* _compute() */
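The index math in _compute() reduces to a small pure function. The sketch below is a hypothetical standalone helper (not part of this file) that applies the same rules: half_pixel_centers shifts the sample point by half a pixel, align_corners selects round-to-nearest instead of floor, and the result is clamped to the last source column. It assumes simple_round() behaves as round-half-up on non-negative input.
#include <math.h>
#include <stdint.h>
/* Hypothetical helper mirroring the loop body of _compute() above. */
static uint32_t nearest_src_index
    (
    uint32_t w,
    float width_scale,
    uint32_t input_width,
    int32_t align_corners,
    int32_t half_pixel_centers
    )
{
    float input_w = half_pixel_centers ?
        ((float)w + 0.5f) * width_scale : (float)w * width_scale;
    uint32_t in_x = align_corners ?
        (uint32_t)(input_w + 0.5f) : (uint32_t)floorf(input_w);
    return in_x < input_width - 1 ? in_x : input_width - 1;
}
For example, with input_width = 4, output_width = 8 and half_pixel_centers set, width_scale is 0.5 and output column 3 samples source column floorf((3 + 0.5) * 0.5) = 1.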
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
/* Add extra params */
)
{
vsi_status status = VSI_FAILURE;
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _compute;
kernel->info.parameters = _resize_1d_nearest_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _resize_1d_nearest_kernel_param_def );
status = VSI_SUCCESS;
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_RESIZE_1D_NEAREST_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" );
int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" );
status = _query_kernel( kernel, inputs, outputs );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _RESIZE_1D_NEAREST_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[SCALAR_ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners );
node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, I32, &half_pixel_centers );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _RESIZE_1D_NEAREST_PARAM_NUM );
VSI_ASSERT( status == VSI_SUCCESS );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALIGN_CORNERS] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( resize_1d_nearest, _setup )

View File

@@ -279,10 +279,10 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer)
 if( U8 == input_dtype && F16 == output_dtype )
 {
-    vx_uint16 M0 = 0;
-    vx_int8 postShift = 0;
-    vx_uint32 multAndoutZP0[2] = {0};
-    vx_uint32 multAndoutZP1[2] = {0};
+    uint16_t M0 = 0;
+    int32_t postShift = 0;
+    uint32_t multAndoutZP0[2] = {0};
+    uint32_t multAndoutZP1[2] = {0};
     gpu_dp_inst_t uniU8MulAndPostShift_0_Lo_2x8 = {{
         0xdddddddd, // TCfg
@@ -305,12 +305,12 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer)
         0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
     }, GPU_DP_TYPE_16};
-    vsi_nn_GetFP32MultiAndPostShift(scaleIn / scaleOut, &M0, &postShift);
+    gpu_quantize_multiplier_16bit(scaleIn / scaleOut, &M0, &postShift);
     multAndoutZP0[0] = (vx_uint32)(M0);
     multAndoutZP0[1] = (vx_uint32)((output_ZP << postShift) - input_ZP * M0);
     uniU8MulAndPostShift_0_Lo_2x8.data[7] |= (postShift & 0x1F);
-    vsi_nn_GetFP32MultiAndPostShift(scaleIn1 / scaleOut, &M0, &postShift);
+    gpu_quantize_multiplier_16bit(scaleIn1 / scaleOut, &M0, &postShift);
     multAndoutZP1[0] = (vx_uint32)(M0);
     multAndoutZP1[1] = (vx_uint32)((output_ZP << postShift) - input_ZP1 * M0);
     uniU8MulAndPostShift_1_Lo_2x8.data[7] |= (postShift & 0x1F);

View File

@@ -268,7 +268,7 @@ DEF_KERNEL_INITIALIZER(_clip_initializer)
 {
     uint32_t multAndoutZP[2] = {0};
     uint16_t M0 = 0;
-    int8_t postShift = 0;
+    int32_t postShift = 0;
     gpu_dp_inst_t uniDataMulAndPostShift_2x8 = {{
         0xdddddddd, // TCfg
         0x44444444, // ASelt
@@ -279,7 +279,7 @@ DEF_KERNEL_INITIALIZER(_clip_initializer)
         0x00000000, 0x00000000, 0x00000000, 0x00000000,
         0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
     }, GPU_DP_TYPE_16};
-    vsi_nn_GetFP32MultiAndPostShift(scaleIn / scaleOut, &M0, &postShift);
+    gpu_quantize_multiplier_16bit(scaleIn / scaleOut, &M0, &postShift);
     multAndoutZP[0] = (uint32_t)(M0);
     multAndoutZP[1] = (uint32_t)(output_ZP << postShift );
@@ -434,8 +434,8 @@ DEF_KERNEL_INITIALIZER(_clip_initializer)
     int32_t packedMaxData[4];
     float uint8Scale = scaleIn / scaleOut;
     uint16_t M0 = 0;
-    int8_t postShift = 0;
+    int32_t postShift = 0;
     uint32_t multAndoutZP[2] = {0};
     gpu_dp_inst_t uniU8MulAndPostShift_Lo_2x8 = {{
         0xdddddddd, // TCfg
         0x44444444, // ASelt
@@ -457,7 +457,7 @@ DEF_KERNEL_INITIALIZER(_clip_initializer)
         0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
     }, GPU_DP_TYPE_16};
-    vsi_nn_GetFP32MultiAndPostShift(uint8Scale, &M0, &postShift);
+    gpu_quantize_multiplier_16bit(uint8Scale, &M0, &postShift);
     multAndoutZP[0] = (uint32_t)(M0);
     multAndoutZP[1] = (uint32_t)((output_ZP << postShift) - input_ZP * M0);

View File

@@ -702,27 +702,66 @@ static vsi_nn_kernel_node_t _setup
 )
 {
     vsi_status status = VSI_FAILURE;
-    vsi_nn_kernel_node_param_t node_params[_DEPTHWISE_CONV1D_PARAM_NUM];
+    vsi_nn_kernel_node_param_t node_params[_DEPTHWISE_CONV1D_PARAM_NUM] = {NULL};
     vsi_nn_kernel_node_t node = NULL;
     int32_t weight_pad_front[VSI_NN_MAX_DIM_NUM] = {0};
     int32_t weight_pad_end[VSI_NN_MAX_DIM_NUM] = {0};
     vsi_nn_tensor_t * weights = NULL;
     vsi_nn_tensor_t * biases = NULL;
     vsi_nn_tensor_t *temp_tensor[3] = {NULL};
+    vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
+    int32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
+    int32_t new_rank = 2;
+    uint32_t i = 0;
     int32_t stride = vsi_nn_kernel_param_get_int32( params, "stride" );
     int32_t pad_front = vsi_nn_kernel_param_get_int32( params, "pad_front" );
     int32_t pad_end = vsi_nn_kernel_param_get_int32( params, "pad_end" );
     int32_t dilation = vsi_nn_kernel_param_get_int32( params, "dilation" );
     _internal_kernel_size_e ks = KN;
-    weight_pad_end[0] = gpu_align_np2_safe(inputs[1]->attr.size[0], 8) - inputs[1]->attr.size[0];
-    weights = vsi_nn_pad_tensor(graph, inputs[1], weight_pad_front, weight_pad_end, inputs[1]->attr.dim_num,
-        VSI_NN_PAD_MODE_CONSTANT, 0);
-    biases = vsi_nn_merge_input_zeropoint_to_bias(graph, inputs[0], inputs[1], inputs[2]);
-    temp_tensor[0] = inputs[0];
+    if (!((VSI_NN_TYPE_UINT8 == inputs[0]->attr.dtype.vx_type)
+        && (VSI_NN_TYPE_UINT8 == inputs[1]->attr.dtype.vx_type)
+        && (NULL == inputs[2] || VSI_NN_TYPE_INT32 == inputs[2]->attr.dtype.vx_type)
+        && (VSI_NN_TYPE_UINT8 == outputs[0]->attr.dtype.vx_type)))
+    {
+        return NULL;
+    }
+    reshape_tensors[0] = inputs[0];
+    if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC)
+    {
+        shape[0] = inputs[1]->attr.size[0];
+        shape[1] = 1;
+        for (i = 1; i < inputs[1]->attr.dim_num; i++)
+        {
+            shape[1] *= inputs[1]->attr.size[i];
+        }
+        reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
+            inputs[1], (uint32_t*)shape, new_rank );
+    }
+    else
+    {
+        reshape_tensors[1] = inputs[1];
+    }
+    if (inputs[2] && inputs[2]->attr.dim_num == 1)
+    {
+        shape[0] = inputs[2]->attr.size[0];
+        shape[1] = 1;
+        new_rank = 2;
+        reshape_tensors[2] = vsi_nn_reshape_tensor( graph,
+            inputs[2], (uint32_t*)shape, new_rank );
+    }
+    weight_pad_end[0] = gpu_align_np2_safe(reshape_tensors[1]->attr.size[0], 8) - reshape_tensors[1]->attr.size[0];
+    weights = vsi_nn_pad_tensor(graph, reshape_tensors[1], weight_pad_front, weight_pad_end,
+        reshape_tensors[1]->attr.dim_num, VSI_NN_PAD_MODE_CONSTANT, 0);
+    biases = vsi_nn_merge_input_zeropoint_to_bias(graph, reshape_tensors[0], reshape_tensors[1], reshape_tensors[2]);
+    temp_tensor[0] = reshape_tensors[0];
     temp_tensor[1] = weights;
     temp_tensor[2] = biases;
@@ -760,6 +799,16 @@ static vsi_nn_kernel_node_t _setup
         }
     }
+    if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC)
+    {
+        vsi_nn_ReleaseTensor( &reshape_tensors[1] );
+    }
+    if (inputs[2] && inputs[2]->attr.dim_num == 1)
+    {
+        vsi_nn_ReleaseTensor( &reshape_tensors[2] );
+    }
     if (weights)
     {
         vsi_nn_ReleaseTensor(&weights);

View File

@@ -164,9 +164,9 @@ DEF_KERNEL_INITIALIZER(_detect_post_box_initializer)
 else if ((U8 == input_attr->dtype) || (U8 == input1_attr->dtype))
 {
     uint16_t M0 = 0;
-    int8_t postShift0 = 0;
+    int32_t postShift0 = 0;
     uint16_t M1 = 0;
-    int8_t postShift1 = 0;
+    int32_t postShift1 = 0;
     uint32_t i = 0;
     gpu_dp_inst_t uniU8SubZptoF32Conv0_4x4 = {{
         0x09090909, // TCfg
@@ -188,8 +188,8 @@ DEF_KERNEL_INITIALIZER(_detect_post_box_initializer)
         0x00010001, 0x00000000, 0x00010001, 0x00000000,
         0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
     }, GPU_DP_TYPE_16 };
-    vsi_nn_GetFP32MultiAndPostShift(scaleIn0, &M0, &postShift0);
-    vsi_nn_GetFP32MultiAndPostShift(scaleIn1, &M1, &postShift1);
+    gpu_quantize_multiplier_16bit(scaleIn0, &M0, &postShift0);
+    gpu_quantize_multiplier_16bit(scaleIn1, &M1, &postShift1);
     uniU8SubZptoF32Conv0_4x4.data[7] |= (postShift0 & 0x1F);
     uniU8SubZptoF32Conv1_4x4.data[7] |= (postShift1 & 0x1F);
     for ( i = 0; i < 8; i++ )

View File

@@ -266,9 +266,11 @@ static vx_param_description_t kernel_param_def[] =
     {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
     {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
     {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
 };
 #define INPUT_FUNC_TYPE (2)
+#define INPUT_SCALAR_ALPHA (3)
 #define _CL_PARAM_NUM _cnt_of_array(kernel_param_def)
 /*
@@ -296,6 +298,7 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer)
     float inputTail = 0;
     float outputScale = 1.0f;
     float outputZP = 0;
+    float alpha = 0;
     uint32_t pack_key;
     attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
@@ -303,7 +306,9 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer)
     attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
     CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
-    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &type);
+    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[INPUT_FUNC_TYPE], &type);
+    CHECK_STATUS_FAIL_GOTO(status, final );
+    status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[INPUT_SCALAR_ALPHA], &alpha);
     CHECK_STATUS_FAIL_GOTO(status, final );
     out_shape = attr[1]->shape;
@@ -408,6 +413,8 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer)
         "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 );
     status |= vsi_nn_kernel_gpu_add_param( node,
         "uniExtractOddData_2x8", &uniExtractOddData_2x8 );
+    status |= vsi_nn_kernel_gpu_add_param( node,
+        "alpha", &alpha );
     CHECK_STATUS_FAIL_GOTO(status, final );
     }
     break;
@@ -466,6 +473,8 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer)
         "outputScale", &outputScale );
     status |= vsi_nn_kernel_gpu_add_param( node,
         "outputZP", &outputZP );
+    status |= vsi_nn_kernel_gpu_add_param( node,
+        "alpha", &alpha );
     if (attr[1]->dtype == F16)
     {
@@ -555,7 +564,8 @@ static vsi_nn_kernel_node_t _setup
     vsi_nn_tensor_t* rs_tensors[2] = { NULL };
     int32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
     int32_t new_rank = 0;
-    vsi_bool ret;
+    vsi_bool ret = FALSE;
+    float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" );
     ret = vsi_nn_kernel_optimize_element_shape(
         (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num,
@@ -586,6 +596,8 @@ static vsi_nn_kernel_node_t _setup
         rs_tensors, 1, &rs_tensors[1], 1 );
     node_params[INPUT_FUNC_TYPE] = vsi_nn_kernel_scalar_create(
         graph, I32, &unary_type );
+    node_params[INPUT_SCALAR_ALPHA] = vsi_nn_kernel_scalar_create(
+        graph, F32, &alpha );
     /* Pass parameters to node. */
     status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM );
@@ -609,6 +621,11 @@ OnError:
         vsi_nn_kernel_scalar_release( &node_params[INPUT_FUNC_TYPE] );
     }
+    if (node_params[INPUT_SCALAR_ALPHA])
+    {
+        vsi_nn_kernel_scalar_release( &node_params[INPUT_SCALAR_ALPHA] );
+    }
     return node;
 } /* _setup() */

View File

@@ -181,8 +181,8 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer)
     float dstScale = 0;
     uint16_t M0 = 0;
     uint16_t M1 = 0;
-    int8_t postShift0 = 0;
-    int8_t postShift1 = 0;
+    int32_t postShift0 = 0;
+    int32_t postShift1 = 0;
     uint32_t pack_key = 0;
     int32_t ac2zero = 0;
@@ -279,8 +279,8 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer)
         dstScale = 1;
         dstZP = 0.0f;
     }
-    vsi_nn_GetFP32MultiAndPostShift(src0Scale / 1.0f, &M0, &postShift0);
-    vsi_nn_GetFP32MultiAndPostShift(src1Scale / 1.0f, &M1, &postShift1);
+    gpu_quantize_multiplier_16bit(src0Scale / 1.0f, &M0, &postShift0);
+    gpu_quantize_multiplier_16bit(src1Scale / 1.0f, &M1, &postShift1);
     mulKIn0In1Zp = (float)((int)(K + 3) / 4 * 4 * src1ZP * src0ZP);
     inOutScale = src0Scale * src1Scale / dstScale;

View File

@@ -588,7 +588,8 @@ static vsi_nn_kernel_node_t _setup
     vsi_nn_kernel_node_param_t node_params[_MOMENTS_PARAM_NUM] = { NULL };
     vsi_nn_kernel_node_t node = NULL;
     int32_t axis_num = 0;
-    int32_t* axis = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "axis", (size_t*)&axis_num);
+    size_t axis_num_temp = 0;
+    int32_t* axis = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "axis", &axis_num_temp);
     int32_t axis_first = axis[0];
     int32_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 1, 1, 1, 1 } };
     vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
@@ -602,6 +603,8 @@ static vsi_nn_kernel_node_t _setup
     vsi_bool image_2d = FALSE;
     vsi_bool is_continue_axis = TRUE;
+    axis_num = (int32_t)axis_num_temp;
     for ( i = 1; i < axis_num; i++)
     {
         if ( axis[i] != (axis[i - 1] + 1) && axis[0] == 0)

View File

@@ -139,7 +139,7 @@ DEF_KERNEL_INITIALIZER(_poolwithargmax_initializer)
     int32_t input_fl = 0;
     int32_t output_fl = 0;
     uint16_t M0 = 0;
-    int8_t postShift = 0;
+    int32_t postShift = 0;
     float inputScale = 1.0f;
     int32_t input_ZP = 0;
     float outputScale = 1.0f;
@@ -193,7 +193,7 @@ DEF_KERNEL_INITIALIZER(_poolwithargmax_initializer)
     if ( ( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
         && ( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) )
     {
-        vsi_nn_GetFP32MultiAndPostShift(inputScale / outputScale, &M0, &postShift);
+        gpu_quantize_multiplier_16bit(inputScale / outputScale, &M0, &postShift);
     }
     image_2d = (vsi_bool)(input_shape->size < 3 || 1 == input_shape->data[2]);

View File

@@ -196,8 +196,8 @@ DEF_KERNEL_INITIALIZER(_pow_initializer)
     float dstZP = 0;
     float dstScale = 1.0f;
-    int8_t postshift0 = 0;
-    int8_t postshift1 = 0;
+    int32_t postshift0 = 0;
+    int32_t postshift1 = 0;
     float outScale_fl = 1;
     uint16_t M0 = 0;
@@ -229,7 +229,7 @@ DEF_KERNEL_INITIALIZER(_pow_initializer)
         src0ZP = attr[0]->asymm.zero_point;
         src0Scale = attr[0]->asymm.scale;
-        vsi_nn_GetFP32MultiAndPostShift(src0Scale / 1.0f, &M0, &postshift0);
+        gpu_quantize_multiplier_16bit(src0Scale / 1.0f, &M0, &postshift0);
     }
     if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
@@ -243,7 +243,7 @@ DEF_KERNEL_INITIALIZER(_pow_initializer)
         src1ZP = attr[1]->asymm.zero_point;
         src1Scale = attr[1]->asymm.scale;
-        vsi_nn_GetFP32MultiAndPostShift(src1Scale / 1.0f, &M1, &postshift1);
+        gpu_quantize_multiplier_16bit(src1Scale / 1.0f, &M1, &postshift1);
     }
     if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )

File diff suppressed because it is too large

View File

@@ -0,0 +1,533 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
typedef enum
{
LARGE = 0,
SMALL
} _internal_nearest_e;
#define _RESIZE_1D_NEAREST_KERNEL_SOURCE "resize_1d_nearest"
#define STR(a) #a
// Add kernel hashtable here
#define RESIZE_1D_NEAREST_HASH_KEY( IN_DTYPE, OUT_DTYPE, mode ) \
(( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) | (mode))
#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_1D_NEAREST_HASH_KEY( IN_DTYPE, OUT_DTYPE, LARGE ), \
CVIVANTE_NAMESPACE("evis.resize_1d_nearest_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
_RESIZE_1D_NEAREST_KERNEL_SOURCE }
#define PACK_KERNEL_MAP_OPT( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_1D_NEAREST_HASH_KEY( IN_DTYPE, OUT_DTYPE, SMALL ), \
CVIVANTE_NAMESPACE("evis.resize_1d_nearest_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_op"), \
_RESIZE_1D_NEAREST_KERNEL_SOURCE }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _resize_1d_nearest_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP(F16, F16),
PACK_KERNEL_MAP(I16, I16),
PACK_KERNEL_MAP(I8, I8),
PACK_KERNEL_MAP(U8, U8),
PACK_KERNEL_MAP_OPT(F16, F16),
PACK_KERNEL_MAP_OPT(I16, I16),
PACK_KERNEL_MAP_OPT(I8, I8),
PACK_KERNEL_MAP_OPT(U8, U8),
};
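The table above is keyed by RESIZE_1D_NEAREST_HASH_KEY, which packs the input dtype into bits 20 and up, the output dtype into bits 8 and up, and the LARGE/SMALL mode into the low byte, so every (in, out, mode) combination maps to a distinct 32-bit key. A function-form sketch of the same packing (hypothetical; the macro above is what the code uses):
#include <stdint.h>
/* [31..20] input dtype | [19..8] output dtype | [7..0] resize mode */
static uint32_t resize_1d_nearest_key(uint32_t in_dtype, uint32_t out_dtype, uint32_t mode)
{
    return (in_dtype << 20) | (out_dtype << 8) | mode;
}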
/*
* Kernel params
*/
static vx_param_description_t _resize_1d_nearest_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _RESIZE_1D_NEAREST_PARAM_NUM _cnt_of_array( _resize_1d_nearest_kernel_param_def )
#define SCALAR_ALIGN_CORNERS (2)
#define SCALAR_HALF_PIXEL (3)
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_resize_1d_nearest_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
#define MAX_POST_SHIFT_BITS (31)
#define MAX_MULTIPLIER_NUM (65535)
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
vsi_int_array_t * out_shape = NULL;
vsi_int_array_t * in_shape = NULL;
vsi_nn_kernel_dtype_e input_dtype = F16;
vsi_nn_kernel_dtype_e output_dtype = F16;
int32_t align_corners = 0;
int32_t half_pixel_centers = 0;
uint32_t depth = 0;
int32_t srcFixPointPos = 0;
int32_t dstFixPointPos = 0;
float input_scale = 1.0;
int32_t inputZP = 0;
float output_scale = 1.0;
int32_t outputZP = 0;
float scale_factor = 1.0f;
uint32_t in_width = 0;
uint32_t out_width = 0;
uint32_t out_height = 0;
float half_pixel_value = 0.0f;
float round_value = 0.0f;
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &align_corners);
CHECK_STATUS_FAIL_GOTO(status, final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &half_pixel_centers);
CHECK_STATUS_FAIL_GOTO(status, final );
out_shape = output_attr->shape;
in_shape = input_attr->shape;
input_dtype = input_attr->dtype;
output_dtype = output_attr->dtype;
in_width = in_shape->data[0];
depth = in_shape->data[2];
out_width = out_shape->data[0];
out_height = out_shape->data[1];
if (BF16 == input_dtype && output_dtype == BF16)
{
input_dtype = F16;
output_dtype = F16;
}
if (align_corners && out_width > 1)
{
scale_factor = ((float)(in_width - 1) * 1.0f) / (float)(out_width - 1);
}
else
{
scale_factor = ((float)in_width * 1.0f) / (float)out_width;
}
if (align_corners)
{
round_value = 0.5f;
}
else
{
round_value = 0.0f;
}
if (half_pixel_centers)
{
half_pixel_value = 0.5f;
}
else
{
half_pixel_value = 0.0f;
}
if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant )
{
input_scale = input_attr->asymm.scale;
inputZP = input_attr->asymm.zero_point;
}
else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
srcFixPointPos = input_attr->dfp.fl;
if (srcFixPointPos >= 0)
{
input_scale = 1.0f / (float) ((int64_t)1 << srcFixPointPos);
}
else if (srcFixPointPos < 0)
{
input_scale = (float)((int64_t)1 << -srcFixPointPos);
}
inputZP = 0;
}
else
{
input_scale = 1.0f;
inputZP = 0;
}
if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant )
{
output_scale = 1.0f / output_attr->asymm.scale;
outputZP = output_attr->asymm.zero_point;
}
else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant)
{
dstFixPointPos = output_attr->dfp.fl;
if (dstFixPointPos >= 0)
{
output_scale = (float) ((int64_t)1 << dstFixPointPos);
}
else if (dstFixPointPos < 0)
{
output_scale = 1.0f / (float) ((int64_t)1 << -dstFixPointPos);
}
outputZP = 0;
}
else
{
output_scale = 1.0;
outputZP = 0;
}
if (F16 == input_dtype && F16 == output_dtype)
{
gpu_dp_inst_t uniGetExtractData_2x8 = {{
0x00009999, // TCfg
0x00000000, // ASelt
0x06040200, 0x00000000, // ABin
0x0000aaaa, // BSelt
0x00000000, 0x00000000, // BBin
0x00000300, // AccumType, ConstantType, and PostShift
0x00100010, 0x00100010, 0x00100010, 0x00100010,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
if (scale_factor < 4.0f)
{
status = vsi_nn_kernel_gpu_add_param( node, "uniGetExtractData_2x8", &uniGetExtractData_2x8);
CHECK_STATUS_FAIL_GOTO(status, final );
}
gpu_param.global_scale[0] = 4;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
status = vsi_nn_kernel_gpu_add_param( node, "scale_x", &scale_factor);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if ( input_dtype == output_dtype && (I8 == input_dtype || I16 == input_dtype))
{
gpu_dp_inst_t uniGetExtractData_2x8 = {{
0x00009999, // TCfg
0x00000000, // ASelt
0x06040200, 0x00000000, // ABin
0x0000aaaa, // BSelt
0x00000000, 0x00000000, // BBin
0x00000300, // AccumType, ConstantType, and PostShift
0x00080008, 0x00080008, 0x00080008, 0x00080008,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniConvertI8toI8_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
if (I16 == input_dtype)
{
uniGetExtractData_2x8.data[8] = 0x00100010;
uniGetExtractData_2x8.data[9] = 0x00100010;
uniGetExtractData_2x8.data[10] = 0x00100010;
uniGetExtractData_2x8.data[11] = 0x00100010;
uniGetExtractData_2x8.data[12] = 0x00100010;
uniGetExtractData_2x8.data[13] = 0x00100010;
uniGetExtractData_2x8.data[14] = 0x00100010;
uniGetExtractData_2x8.data[15] = 0x00100010;
}
if (srcFixPointPos > dstFixPointPos)
{
int32_t postshift = vsi_nn_min(srcFixPointPos - dstFixPointPos, MAX_POST_SHIFT_BITS);
uniConvertI8toI8_2x8.data[7] |= (postshift & 0x1F);
}
else
{
uint32_t multiplier = vsi_nn_min((int64_t)1 << (dstFixPointPos - srcFixPointPos), MAX_MULTIPLIER_NUM);
uint32_t i = 0;
for (i = 0; i < 8; i++)
{
uniConvertI8toI8_2x8.data[i + 8] = multiplier;
}
}
if (scale_factor < 4.0f)
{
status = vsi_nn_kernel_gpu_add_param( node, "uniGetExtractData_2x8", &uniGetExtractData_2x8);
CHECK_STATUS_FAIL_GOTO(status, final );
}
gpu_param.global_scale[0] = 4;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
status = vsi_nn_kernel_gpu_add_param( node, "scale_x", &scale_factor);
status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertI8toI8_2x8", &uniConvertI8toI8_2x8);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (U8 == input_dtype && U8 == output_dtype)
{
uint16_t M0 = 0;
int32_t postShift = 0;
uint32_t multAndoutZP[2] = {0};
gpu_dp_inst_t uniMultiplyAndPostShift_2x8 = {{
0xdddddddd, // TCfg
0x44444444, // ASelt
0x13121110, 0x17161514, // ABin
0x11111111, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniGetExtractData_2x8 = {{
0x00009999, // TCfg
0x00000000, // ASelt
0x06040200, 0x00000000, // ABin
0x0000aaaa, // BSelt
0x00000000, 0x00000000, // BBin
0x00000300, // AccumType, ConstantType, and PostShift
0x00080008, 0x00080008, 0x00080008, 0x00080008,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_quantize_multiplier_16bit(input_scale * output_scale, &M0, &postShift);
multAndoutZP[0] = (uint32_t)(M0);
multAndoutZP[1] = (uint32_t)((outputZP << postShift) - inputZP * M0);
uniMultiplyAndPostShift_2x8.data[7] |= (postShift & 0x1F);
if (scale_factor < 4.0f)
{
status = vsi_nn_kernel_gpu_add_param( node, "uniGetExtractData_2x8", &uniGetExtractData_2x8);
CHECK_STATUS_FAIL_GOTO(status, final );
}
gpu_param.global_scale[0] = 4;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
status = vsi_nn_kernel_gpu_add_param( node, "scale_x", &scale_factor);
status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP", multAndoutZP);
status |= vsi_nn_kernel_gpu_add_param( node, "uniMultiplyAndPostShift_2x8", &uniMultiplyAndPostShift_2x8);
CHECK_STATUS_FAIL_GOTO(status, final );
}
status = vsi_nn_kernel_gpu_add_param( node, "half_pixel_value", &half_pixel_value);
status |= vsi_nn_kernel_gpu_add_param( node, "round_value", &round_value);
CHECK_STATUS_FAIL_GOTO(status, final );
gpu_param.global_size[0] = gpu_align_p2((out_width + gpu_param.global_scale[0] - 1)\
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (out_height + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1];
gpu_param.global_size[2] = depth;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
#undef MAX_MULTIPLIER_NUM
#undef MAX_POST_SHIFT_BITS
final:
if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr );
if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr );
return status;
} /* _resize_1d_nearest_initializer() */
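The quantization handling above boils down to two scale conventions: asymmetric tensors carry an explicit float scale and zero point, while dynamic fixed point (DFP) encodes the scale as a fraction length fl, i.e. real = stored * 2^(-fl). A minimal sketch of the DFP conversion used for input_scale (hypothetical helper name):
#include <stdint.h>
/* real_value = stored_value * dfp_scale(fl); fl may be negative. */
static float dfp_scale(int32_t fl)
{
    return (fl >= 0) ? 1.0f / (float)((int64_t)1 << fl)
                     : (float)((int64_t)1 << -fl);
}
Note that output_scale above is stored as the reciprocal (1 / scale) so the shader can multiply instead of divide.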
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t align_corners
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype = F16;
vsi_nn_kernel_dtype_e out_dtype = F16;
const _kernel_map_type * kernel_map = _resize_1d_nearest_kernel_map;
size_t kernel_map_size = _cnt_of_array( _resize_1d_nearest_kernel_map );
vx_param_description_t * param_def = _resize_1d_nearest_kernel_param_def;
size_t param_def_size = _cnt_of_array( _resize_1d_nearest_kernel_param_def );
vx_kernel_initialize_f initializer = _resize_1d_nearest_initializer;
uint32_t key = 0;
uint32_t i = 0;
uint32_t inputWidth = inputs[0]->attr.size[0];
uint32_t outputWidth = outputs[0]->attr.size[0];
float scale_factor;
_internal_nearest_e resize_mode = LARGE;
if (align_corners && outputWidth > 1)
{
scale_factor = (vx_float32)(inputWidth - 1) / (vx_float32)(outputWidth - 1);
}
else
{
scale_factor = (vx_float32)inputWidth / (vx_float32)outputWidth;
}
if (scale_factor < 4.0f)
{
resize_mode = SMALL;
}
else
{
resize_mode = LARGE;
}
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (BF16 == in_dtype && BF16 == out_dtype)
{
in_dtype = F16;
out_dtype = F16;
}
key = RESIZE_1D_NEAREST_HASH_KEY( in_dtype, out_dtype, resize_mode );
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = (uint32_t)param_def_size;
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_RESIZE_1D_NEAREST_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" );
int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" );
status = _query_kernel( kernel, inputs, outputs, align_corners );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _RESIZE_1D_NEAREST_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[SCALAR_ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners );
node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, I32, &half_pixel_centers );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _RESIZE_1D_NEAREST_PARAM_NUM );
VSI_ASSERT( status == VSI_SUCCESS );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALIGN_CORNERS] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( resize_1d_nearest, _setup )

View File

@@ -850,7 +850,7 @@ static vsi_status _query_kernel
         }
     }
-    if ((UP_2X_HALF == scale_flag) && (i >= kernel_map_size))
+    if ((UP_2X_HALF == scale_flag) && (i >= kernel_map_size) && is_same_type && is_evis2)
     {
         scale_flag = UP_OPT;
         key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag );

View File

@@ -348,7 +348,7 @@ DEF_KERNEL_INITIALIZER(_resize_nearest_initializer)
 else if (U8 == input_dtype && U8 == output_dtype)
 {
     uint16_t M0 = 0;
-    vx_int8 postShift = 0;
+    int32_t postShift = 0;
     uint32_t multAndoutZP[2] = {0};
     gpu_dp_inst_t uniMultiplyAndPostShift_2x8 = {{
         0xdddddddd, // TCfg
@@ -371,7 +371,7 @@ DEF_KERNEL_INITIALIZER(_resize_nearest_initializer)
         0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
     }, GPU_DP_TYPE_16};
-    vsi_nn_GetFP32MultiAndPostShift(input_scale * output_scale, &M0, &postShift);
+    gpu_quantize_multiplier_16bit(input_scale * output_scale, &M0, &postShift);
     multAndoutZP[0] = (uint32_t)(M0);
     multAndoutZP[1] = (uint32_t)((outputZP << postShift) - inputZP * M0);

View File

@@ -131,9 +131,9 @@ DEF_KERNEL_INITIALIZER(_select_initializer)
     float outputScale = 1.0f;
     int32_t outputZP = 0;
     uint16_t in0_M0 = 0;
-    int8_t in0_postShift = 0;
+    int32_t in0_postShift = 0;
     uint16_t in1_M0 = 0;
-    int8_t in1_postShift = 0;
+    int32_t in1_postShift = 0;
     uint32_t pack_key = 0;
     input0_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0);
     CHECK_PTR_FAIL_GOTO( input0_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
@@ -196,8 +196,8 @@ DEF_KERNEL_INITIALIZER(_select_initializer)
         outputZP = output_attr->asymm.zero_point;
     }
-    vsi_nn_GetFP32MultiAndPostShift(input0Scale / outputScale, &in0_M0, &in0_postShift);
-    vsi_nn_GetFP32MultiAndPostShift(input1Scale / outputScale, &in1_M0, &in1_postShift);
+    gpu_quantize_multiplier_16bit(input0Scale / outputScale, &in0_M0, &in0_postShift);
+    gpu_quantize_multiplier_16bit(input1Scale / outputScale, &in1_M0, &in1_postShift);
     pack_key = _PACK_SELECT_KEY( input0_attr->dtype, input1_attr->dtype, output_attr->dtype );

View File

@@ -309,7 +309,7 @@ DEF_KERNEL_INITIALIZER(_tile_initializer)
 {
     float uint8Scale = scaleIn / scaleOut;
     uint16_t M0 = 0;
-    int8_t postShift = 0;
+    int32_t postShift = 0;
     uint32_t multAndoutZP[2] = {0};
     gpu_dp_inst_t uniU8MulAndPostShift_Lo_2x8 = {{
@@ -323,7 +323,7 @@ DEF_KERNEL_INITIALIZER(_tile_initializer)
         0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
     }, GPU_DP_TYPE_16};
-    vsi_nn_GetFP32MultiAndPostShift(uint8Scale, &M0, &postShift);
+    gpu_quantize_multiplier_16bit(uint8Scale, &M0, &postShift);
     multAndoutZP[0] = (uint32_t)(M0);
     multAndoutZP[1] = (uint32_t)((output_ZP << postShift) - input_ZP * M0);

View File

@@ -154,7 +154,7 @@ DEF_KERNEL_INITIALIZER(_upsample_initializer)
     int32_t input_fl = 0;
     int32_t output_fl = 0;
     uint16_t M0 = 0;
-    int8_t postShift = 0;
+    int32_t postShift = 0;
     float inputScale = 1.0f;
     int32_t input_ZP = 0;
     float outputScale = 1.0f;
@@ -212,7 +212,7 @@ DEF_KERNEL_INITIALIZER(_upsample_initializer)
     factorOut = 1.0f / outputScale;
-    vsi_nn_GetFP32MultiAndPostShift(inputScale / outputScale, &M0, &postShift);
+    gpu_quantize_multiplier_16bit(inputScale / outputScale, &M0, &postShift);
     image_2d = (vsi_bool)(input_shape->size < 3 || 1 == input_shape->data[2]);

View File

@@ -84,7 +84,7 @@ void gpu_quantize_multiplier_32bit
     double q;
     int64_t q_fixed;
     const int32_t bit = 32;
-    if( vsi_abs(double_multiplier - 0.0) < 1e-5 )
+    if( vsi_abs(double_multiplier - 0.0) < 1e-8 )
     {
         *quantize_multiplier = 0;
         *shift = bit - 0;
@@ -116,6 +116,25 @@ void gpu_quantize_multiplier_32bit
     }
 } /* gpu_quantize_multiplier_32_bit() */
+void _modify_multiplier_postshift
+    (
+    uint16_t * quantize_multiplier,
+    int32_t * shift
+    )
+{
+    uint16_t multiplier = *quantize_multiplier;
+    int32_t postshift = *shift;
+    while (postshift > GPU_MAX_POST_SHIFT_BITS)
+    {
+        multiplier = (multiplier + 1) >> 1;
+        postshift --;
+    }
+    *quantize_multiplier = multiplier;
+    *shift = postshift;
+}
 void gpu_quantize_multiplier_16bit
     (
     double double_multiplier,
@@ -135,5 +154,8 @@ void gpu_quantize_multiplier_16bit
     {
         *shift -= bit;
     }
+    _modify_multiplier_postshift(quantize_multiplier, shift);
 } /* gpu_quantize_multiplier_16bit() */
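gpu_quantize_multiplier_16bit approximates a positive float multiplier m as M0 * 2^(-postShift) with a 16-bit M0; the new _modify_multiplier_postshift pass then halves M0 (rounding up) while decrementing postShift until the shift fits the hardware field, preserving M0 * 2^(-postShift) at slightly reduced precision. The clamp in isolation, as a sketch assuming a GPU_MAX_POST_SHIFT_BITS-style bound:
#include <stdint.h>
/* Sketch: keep m ~= (*M0) * 2^(-*shift) while forcing *shift <= max_bits. */
static void clamp_post_shift(uint16_t *M0, int32_t *shift, int32_t max_bits)
{
    while (*shift > max_bits)
    {
        *M0 = (uint16_t)((*M0 + 1) >> 1); /* halve multiplier, round up */
        (*shift)--;                       /* compensate with one less shift */
    }
}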

View File

@@ -1145,10 +1145,10 @@ vsi_nn_kernel_tensor_attr_t * vsi_nn_kernel_tensor_attr_create
     status = vxQueryTensor( (vx_tensor)tensor, VX_TENSOR_SCALE,
         &(attr->asymm.scale), sizeof(float));
     CHECK_STATUS( status );
-    // Reset scale to 1
-    if( (attr->asymm.scale - 0.f) < 1e-5 )
+    // Reset scale to 1e-8
+    if( (attr->asymm.scale - 0.f) < 1e-8 )
     {
-        attr->asymm.scale = 1.0f;
+        attr->asymm.scale = (float)1e-8;
         attr->asymm.zero_point = 0;
     }
 }
@@ -1225,12 +1225,16 @@ vsi_status vsi_nn_kernel_pirority_set
 static vsi_bool _check_shader_support(vsi_nn_graph_t* graph)
 {
     char *envctrl;
-    int32_t enableShader = 1;
-    envctrl = getenv("VIV_VX_ENABLE_SHADER");
-    if (envctrl)
+    static int32_t enableShader = -1;
+    if (enableShader == -1)
     {
-        enableShader = atoi(envctrl);
+        enableShader = 1;
+        envctrl = getenv("VIV_VX_ENABLE_SHADER");
+        if (envctrl)
+        {
+            enableShader = atoi(envctrl);
+        }
     }
 #if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
@@ -1240,7 +1244,7 @@ static vsi_bool _check_shader_support(vsi_nn_graph_t* graph)
     }
 #endif
-    if(enableShader == 1)
+    if (enableShader >= 1)
     {
         return TRUE;
     }
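The _check_shader_support change caches the VIV_VX_ENABLE_SHADER lookup in a function-local static, so the environment is queried once per process instead of once per graph, and any value >= 1 now counts as enabled. The lazy-init pattern in isolation, as a sketch:
#include <stdlib.h>
/* Sketch: resolve the env flag once; later calls reuse the cached value. */
static int shader_enabled(void)
{
    static int cached = -1;              /* -1 means not resolved yet */
    if (cached == -1)
    {
        const char *v = getenv("VIV_VX_ENABLE_SHADER");
        cached = v ? atoi(v) : 1;        /* default: shaders enabled */
    }
    return cached >= 1;
}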

View File

@@ -459,7 +459,7 @@ vsi_bool vsi_nn_kernel_optimize_broadcast_shape
     k = 0;
     for (j = 0; j < (size_t)input_num; j++)
     {
-        if (size_in[k] > 1)
+        if (size_in[j] > 1)
         {
             k = j;
             break;
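This one-character fix matters: the loop is meant to select the first input whose extent in the current dimension is greater than 1 (the non-broadcast operand), but indexing with k, which is still 0, only ever tested input 0. Reduced to a sketch of the corrected selection:
#include <stddef.h>
#include <stdint.h>
/* Sketch: index of the first input with extent > 1; 0 if all broadcast. */
static size_t pick_reference_input(const int32_t *size_in, size_t input_num)
{
    size_t j, k = 0;
    for (j = 0; j < input_num; j++)
    {
        if (size_in[j] > 1)   /* was size_in[k], i.e. always input 0 */
        {
            k = j;
            break;
        }
    }
    return k;
}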

View File

@@ -603,10 +603,14 @@ vsi_nn_tensor_t* vsi_nn_merge_input_zeropoint_to_bias
     vsi_nn_tensor_t * bias
     )
 {
     vsi_nn_tensor_t * new_bias = NULL;
     vsi_nn_tensor_attr_t attr;
+    int32_t *new_bias_data_ptr = NULL;
+    uint8_t *weight_data = NULL;
+    int32_t *bias_data = NULL;
+    uint32_t i, j;
     memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
+    weight_data = vsi_nn_ConvertTensorToData(graph, weight);
     if (bias == NULL)
     {
@@ -620,26 +624,47 @@ vsi_nn_tensor_t* vsi_nn_merge_input_zeropoint_to_bias
         attr.dtype.zero_point = 0;
         attr.dtype.vx_type = VSI_NN_TYPE_INT32;
         }
-        else
-        {
-            VSILOGE("need to add ...");
-        }
     }
     else
     {
         memcpy(&attr, &bias->attr, sizeof(vsi_nn_tensor_attr_t));
+        if (attr.dim_num == 1)
+        {
+            attr.size[1] = 1;
+            attr.dim_num = 2;
+        }
+        bias_data = (int32_t *)vsi_nn_ConvertTensorToData(graph, bias);
     }
-    new_bias = vsi_nn_CreateTensorWithDefault(graph, &attr, 0.0);
-    if (input->attr.dtype.zero_point == 0)
+    new_bias_data_ptr = (int32_t *)malloc(attr.size[0] * sizeof(int32_t));
+    memset((void *)new_bias_data_ptr, 0, sizeof(int32_t) * attr.size[0]);
+    if (input->attr.dtype.zero_point != 0)
     {
-        return new_bias;
+        for (i = 0; i < weight->attr.size[1]; i++)
+        {
+            uint8_t *weight_ptr = weight_data + i * weight->attr.size[0];
+            for (j = 0; j < weight->attr.size[0]; j++)
+            {
+                new_bias_data_ptr[i] += -((int32_t)weight_ptr[j] - weight->attr.dtype.zero_point) \
+                    * input->attr.dtype.zero_point;
+            }
+        }
     }
-    else
+    if (bias_data != NULL)
     {
-        VSILOGE("need to process bias - (input_zp * (w - w_zp)) ...");
+        for (i = 0; i < weight->attr.size[1]; i++)
+        {
+            new_bias_data_ptr[i] += bias_data[i];
+        }
     }
+    new_bias = vsi_nn_CreateTensorFromData(graph, (uint8_t *)new_bias_data_ptr, &attr);
+    vsi_nn_safe_free( new_bias_data_ptr );
+    vsi_nn_safe_free( bias_data );
+    vsi_nn_safe_free( weight_data );
     return new_bias;
 }
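The rewritten helper replaces the old VSILOGE placeholders with the actual fold: for asymmetric-u8 convolution, sum_j (in_j - in_zp) * (w_j - w_zp) equals sum_j in_j * (w_j - w_zp) plus a per-output-channel constant, so that constant can be merged into the bias as new_bias[i] = bias[i] - in_zp * sum_j (W[i][j] - w_zp). A standalone sketch of that arithmetic (hypothetical helper; u8 weights laid out [out_ch][k]):
#include <stddef.h>
#include <stdint.h>
static void fold_input_zp
    (
    const uint8_t *W, size_t out_ch, size_t k,
    int32_t w_zp, int32_t in_zp,
    const int32_t *bias,   /* may be NULL */
    int32_t *new_bias
    )
{
    size_t i, j;
    for (i = 0; i < out_ch; i++)
    {
        int32_t acc = bias ? bias[i] : 0;
        for (j = 0; j < k; j++)
        {
            acc -= ((int32_t)W[i * k + j] - w_zp) * in_zp;
        }
        new_bias[i] = acc;
    }
}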

View File

@@ -31,6 +31,8 @@
 #include "kernel/vsi_nn_kernel.h"
 #include "kernel/vsi_nn_kernel_node.h"
 #include "vsi_nn_feature.h"
+#include "vsi_nn_tensor_util.h"
+#include "vsi_nn_graph_optimization.h"
 static vsi_bool _build_vx_conv2d_param
 (
@@ -173,6 +175,7 @@ static vx_tensor _expand_tensor_dim
     vsi_nn_kernel_t * kernel \
     )
+
 REGISTER_CONV_OPENVX_KERNEL( conv1d )
 {
     vx_node node = NULL;
@@ -196,10 +199,34 @@ REGISTER_CONV_OPENVX_KERNEL( conv1d )
     temp_tensors[0] = _expand_tensor_dim( inputs[0]->t,
         (int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, 0 );
     CHECK_PTR_FAIL_GOTO( temp_tensors[0], "Expand input dim fail.", final );
-    temp_tensors[1] = _expand_tensor_dim( inputs[1]->t,
-        (int32_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, 0 );
-    CHECK_PTR_FAIL_GOTO( temp_tensors[1], "Expand kernel dim fail.", final );
+    if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC)
+    {
+        temp_tensors[1] = _expand_tensor_dim( inputs[1]->t,
+            (int32_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, 0 );
+        CHECK_PTR_FAIL_GOTO( temp_tensors[1], "Expand kernel dim fail.", final );
+    }
+    else
+    {
+        uint8_t * data = NULL;
+        vsi_nn_tensor_attr_t attr;
+        uint32_t i;
+        data = vsi_nn_ConvertTensorToData( graph, inputs[1] );
+        CHECK_PTR_FAIL_GOTO( data, "Convert data fail.", final );
+        memcpy(&attr, &inputs[1]->attr, sizeof(vsi_nn_tensor_attr_t));
+        attr.size[0] = 1;
+        for (i = 1; i <= inputs[1]->attr.dim_num; i++)
+        {
+            attr.size[i] = inputs[1]->attr.size[i - 1];
+        }
+        attr.dim_num = inputs[1]->attr.dim_num + 1;
+        attr.dtype.channel_dim = inputs[1]->attr.dtype.channel_dim + 1;
+        temp_tensors[1] = vsi_nn_CreateRawTensorFromData(graph, data, &attr);
+        vsi_nn_safe_free( data );
+    }
     temp_tensors[2] = _expand_tensor_dim( outputs[0]->t,
         (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, 0 );
@@ -248,9 +275,38 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d )
         (int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, 0 );
     CHECK_PTR_FAIL_GOTO( temp_tensors[0], "Expand input dim fail.", final );
-    temp_tensors[1] = _expand_tensor_dim( inputs[1]->t,
-        (int32_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, 0 );
-    CHECK_PTR_FAIL_GOTO( temp_tensors[1], "Expand kernel dim fail.", final );
+    if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC)
+    {
+        temp_tensors[1] = _expand_tensor_dim( inputs[1]->t,
+            (int32_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, 0 );
+        CHECK_PTR_FAIL_GOTO( temp_tensors[1], "Expand kernel dim fail.", final );
+    }
+    else
+    {
+        uint8_t * data = NULL;
+        vsi_nn_tensor_attr_t attr;
+        uint32_t i;
+        data = vsi_nn_ConvertTensorToData( graph, inputs[1] );
+        CHECK_PTR_FAIL_GOTO( data, "Convert data fail.", final );
+        memcpy(&attr, &inputs[1]->attr, sizeof(vsi_nn_tensor_attr_t));
+        attr.size[0] = 1;
+        attr.size[1] = inputs[1]->attr.size[0];
+        attr.size[2] = 1;
+        for (i = 1; i < inputs[1]->attr.dim_num; i++)
+        {
+            attr.size[2] *= inputs[1]->attr.size[i];
+        }
+        attr.size[3] = 1;
+        attr.dim_num = 4;
+        attr.dtype.channel_dim = 2;
+        temp_tensors[1] = vsi_nn_CreateRawTensorFromData(graph, data, &attr);
+        vsi_nn_safe_free( data );
+    }
     temp_tensors[2] = _expand_tensor_dim( outputs[0]->t,
         (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, 0 );
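The per-channel branch cannot reuse _expand_tensor_dim because inserting a leading dimension shifts every axis up by one, and the per-channel quantization axis (dtype.channel_dim) has to move with the data or the scales would apply to the wrong axis. The attribute rewrite for the conv1d case, in outline (a sketch only; the code above is the authoritative version):
/* [k, ..., oc] -> [1, k, ..., oc], keeping channel_dim on oc. */
static void expand_weight_attr(vsi_nn_tensor_attr_t *attr)
{
    uint32_t i;
    for (i = attr->dim_num; i >= 1; i--)
    {
        attr->size[i] = attr->size[i - 1];  /* shift axes up by one */
    }
    attr->size[0] = 1;                      /* new leading W dimension */
    attr->dim_num += 1;
    attr->dtype.channel_dim += 1;           /* scales follow the channels */
}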

View File

@@ -38,27 +38,27 @@ typedef struct _sort_lut_s
     float val;
 } sort_lut;
-static float exp_eval(float val)
+static float exp_eval(float val, float alpha)
 {
     return expf(val);
 }
-static float log_eval(float data)
+static float log_eval(float data, float alpha)
 {
     return logf(data);
 }
-static float elu_eval(float data)
+static float elu_eval(float data, float alpha)
 {
-    return data >=0 ? data : expf(data) - 1;
+    return data >=0 ? data : expf(data) * alpha - alpha;
 }
-static float neg_eval(float data)
+static float neg_eval(float data, float alpha)
 {
     return data * -1.0f;
 }
-static float hsigmoid_eval(float data)
+static float hsigmoid_eval(float data, float alpha)
 {
     data = (float)(0.2 * data + 0.5);
     data = vsi_nn_clamp(data, 0, 1);
@@ -66,14 +66,14 @@ static float hsigmoid_eval(float data)
     return data;
 }
-static float soft_plus_eval(float data)
+static float soft_plus_eval(float data, float alpha)
 {
-    return log_eval(exp_eval(data) + 1);
+    return log_eval(exp_eval(data, alpha) + 1, alpha);
 }
-static float mish_eval(float data)
+static float mish_eval(float data, float alpha)
 {
-    data = (float)(data * tanh(soft_plus_eval(data)));
+    data = (float)(data * tanh(soft_plus_eval(data, alpha)));
     return data;
 }
@@ -96,7 +96,7 @@ static int32_t _lut_comparator(const void *pa, const void *pb)
     return 0;
 }
-static void _set_unary_table_lookup(float func(float), float *index, float *value)
+static void _set_unary_table_lookup(float func(float, float), float *index, float *value, float alpha)
 {
 #define VSI_NN_MAX_LUT_SIZE (1024)
 #define FLT16_MAX (57344)
@@ -108,25 +108,25 @@ static void _set_unary_table_lookup(float func(float), float *index, float *valu
     {
         int16_t val = (int16_t)(i << 6);
         lut[i].index = fp16_to_fp32(val);
-        lut[i].val = func(lut[i].index);
+        lut[i].val = func(lut[i].index, alpha);
     }
     for (i = 0x0; i < 0x10; i++)
     {
         lut[i].index = 0;
-        lut[i].val = func(lut[i].index);
+        lut[i].val = func(lut[i].index, alpha);
     }
     for (i = 0x1F0; i < 0x200; i++)
     {
         lut[i].index = FLT16_MAX;
-        lut[i].val = func(lut[i].index);
+        lut[i].val = func(lut[i].index, alpha);
     }
     for (i = 0x3F0; i < 0x400; i++)
     {
         lut[i].index = FLT16_MIN;
-        lut[i].val = func(lut[i].index);
+        lut[i].val = func(lut[i].index, alpha);
     }
     qsort(lut, VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut), _lut_comparator);
@@ -154,13 +154,14 @@ static vsi_nn_kernel_node_t _setup
     size_t output_num,
     const vsi_nn_kernel_param_t * params,
     vsi_nn_kernel_t * kernel,
-    float func(float)
+    float func(float, float)
     )
 {
 #ifdef VX_USER_LOOKUP_TABLE_SUPPORT
     vx_lut lut1 = NULL;
     vx_lut lut2 = NULL;
     vx_node node = NULL;
+    float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" );
     float index[1024] = {0};
     float value[1024] = {0};
@@ -172,7 +173,7 @@ static vsi_nn_kernel_node_t _setup
         return NULL;
     }
-    _set_unary_table_lookup(func, index, value);
+    _set_unary_table_lookup(func, index, value, alpha);
     lut1 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024);
     lut2 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024);
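Threading alpha through the evaluators makes the F16 lookup tables correct for parameterized activations; for ELU the table entry becomes elu(x) = x for x >= 0 and alpha * (exp(x) - 1) otherwise, which is exactly the expf(data) * alpha - alpha form above. As a self-contained reference:
#include <math.h>
/* Reference ELU; alpha * (expf(x) - 1) == expf(x) * alpha - alpha. */
static float elu_ref(float x, float alpha)
{
    return x >= 0.0f ? x : alpha * (expf(x) - 1.0f);
}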

View File

@ -1,12 +1,12 @@
float4 eltwise_unary_sin(float4 x) float4 eltwise_unary_sin(float4 x, float alpha)
{ {
return native_sin(x); return native_sin(x);
} }
#define logE (1.44269502f) #define logE (1.44269502f)
#define twoLogE (logE * 2.0f) #define twoLogE (logE * 2.0f)
float4 eltwise_unary_exp(float4 x) float4 eltwise_unary_exp(float4 x, float alpha)
{ {
x *= logE; x *= logE;
x = exp2(x); x = exp2(x);
@ -14,33 +14,33 @@ float4 eltwise_unary_exp(float4 x)
} }
#define rlogE (0.693147182f) #define rlogE (0.693147182f)
float4 eltwise_unary_log(float4 x) float4 eltwise_unary_log(float4 x, float alpha)
{ {
x = log2(x); x = log2(x);
return x * rlogE; return x * rlogE;
} }
-float4 eltwise_unary_elu(float4 val)
+float4 eltwise_unary_elu(float4 val, float alpha)
 {
     float4 x = val * logE;
-    x = exp2(x) - 1;
+    x = exp2(x) * alpha - alpha;
     return val < 0 ? x : val;
 }
 
-float4 eltwise_unary_neg(float4 x)
+float4 eltwise_unary_neg(float4 x, float alpha)
 {
     return x * -1;
 }
 
-float4 eltwise_unary_hard_sigmoid(float4 x)
+float4 eltwise_unary_hard_sigmoid(float4 x, float alpha)
 {
     x = 0.2 * x + 0.5;
     x = clamp(x, 0, 1);
     return x;
 }
 
-float4 _softrelu(float4 x)
+float4 _softrelu(float4 x, float alpha)
 {
     x *= logE;
     x = exp2(x);
@@ -49,7 +49,7 @@ float4 _softrelu(float4 x)
     return x * rlogE;
 }
 
-float4 _tanh(float4 x)
+float4 _tanh(float4 x, float alpha)
 {
     x *= -twoLogE;
     x = 1 + exp2(x);
@@ -57,10 +57,10 @@ float4 _tanh(float4 x)
     return (2 * x - 1);
 }
 
-float4 eltwise_unary_mish(float4 x)
+float4 eltwise_unary_mish(float4 x, float alpha)
 {
-    float4 y = _softrelu(x);
-    x = x * _tanh(y);
+    float4 y = _softrelu(x, alpha);
+    x = x * _tanh(y, alpha);
     return x;
 }
@@ -72,14 +72,15 @@ __kernel void func_name##_F32toF32 \
     float inputScale, \
     float inputTail, \
     float outputScale, \
-    float outputZP \
+    float outputZP, \
+    float alpha \
     ) \
 { \
     int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
 \
     float4 src = read_imagef(input, coord); \
 \
-    float4 dst = eltwise_unary_##func_name(src); \
+    float4 dst = eltwise_unary_##func_name(src, alpha); \
 \
     write_imagef(output, coord, dst); \
 }
@@ -99,14 +100,15 @@ __kernel void func_name##_F32toF32_2D \
     float inputScale, \
     float inputTail, \
     float outputScale, \
-    float outputZP \
+    float outputZP, \
+    float alpha \
     ) \
 { \
     int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
 \
     float4 src = read_imagef(input, coord); \
 \
-    float4 dst = eltwise_unary_##func_name(src); \
+    float4 dst = eltwise_unary_##func_name(src, alpha); \
 \
     write_imagef(output, coord, dst); \
 }
@@ -126,7 +128,8 @@ __kernel void func_name##_U8toU8 \
     float inputScale, \
     float inputTail, \
     float outputScale, \
-    float outputZP \
+    float outputZP, \
+    float alpha \
     ) \
 { \
     int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
@@ -134,7 +137,7 @@ __kernel void func_name##_U8toU8 \
     uint4 src = read_imageui(input, coord); \
     float4 data = convert_float4(src) * inputScale - inputTail; \
 \
-    data = eltwise_unary_##func_name(data); \
+    data = eltwise_unary_##func_name(data, alpha); \
     uint4 dst = convert_uint4(data * outputScale + outputZP); \
 \
     write_imageui(output, coord, dst); \
@@ -155,7 +158,8 @@ __kernel void func_name##_U8toU8_2D \
     float inputScale, \
     float inputTail, \
     float outputScale, \
-    float outputZP \
+    float outputZP, \
+    float alpha \
     ) \
 { \
     int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
@@ -163,7 +167,7 @@ __kernel void func_name##_U8toU8_2D \
     uint4 src = read_imageui(input, coord); \
     float4 data = convert_float4(src) * inputScale - inputTail; \
 \
-    data = eltwise_unary_##func_name(data); \
+    data = eltwise_unary_##func_name(data, alpha); \
     uint4 dst = convert_uint4(data * outputScale + outputZP); \
 \
     write_imageui(output, coord, dst); \
@@ -184,7 +188,8 @@ __kernel void neg_I32toI32
     float inputScale,
     float inputTail,
     float outputScale,
-    float outputZP
+    float outputZP,
+    float alpha
     )
 {
     int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
@@ -202,7 +207,8 @@ __kernel void neg_I32toI32_2D
     float inputScale,
     float inputTail,
     float outputScale,
-    float outputZP
+    float outputZP,
+    float alpha
    )
 {
     int2 coord = (int2)(get_global_id(0), get_global_id(1));
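Note: the diff above threads an alpha parameter through every unary helper so ELU can honor it. Since exp2(x * logE) is exp(x) computed via the hardware's base-2 exponential, exp2(x * logE) * alpha - alpha equals alpha * (exp(x) - 1). A minimal host-side reference in C (the name elu_ref is illustrative, not part of the kernel source):

#include <math.h>

/* Reference ELU with alpha: x < 0 ? alpha * (exp(x) - 1) : x.
   Matches the kernel's exp2(x * logE) * alpha - alpha for negative x. */
static float elu_ref(float x, float alpha)
{
    return x < 0.0f ? alpha * (expf(x) - 1.0f) : x;
}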
@@ -0,0 +1,57 @@
__kernel void resize_1d_bilinear_F32toF32(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
float scale_x,
float half_pixel_value
)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x - half_pixel_value;
float left_x_f = floor(in_x);
float x_lerp = in_x - left_x_f;
int left_x_idx = convert_int(left_x_f);
int4 coord_in = (int4)(left_x_idx, coord_out.y, coord_out.z, 0);
float4 top_l, top_r, top, bottom, dst;
top_l = read_imagef(input, coord_in);
coord_in.x++;
top_r = read_imagef(input, coord_in);
top_r = top_r - top_l;
dst = top_l + x_lerp * top_r;
write_imagef(output, coord_out, dst);
}
__kernel void resize_1d_bilinear_U8toU8(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
float scale_x,
float half_pixel_value,
float in_scale,
float in_tail,
float out_scale,
float out_tail
)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x - half_pixel_value;
float left_x_f = floor(in_x);
float x_lerp = in_x - left_x_f;
int left_x_idx = convert_int(left_x_f);
int4 coord_in = (int4)(left_x_idx, coord_out.y, coord_out.z, 0);
float4 top_l, top_r, top;
uint4 dst;
top_l = convert_float4(read_imageui(input, coord_in)) * in_scale + in_tail;
coord_in.x++;
top_r = convert_float4(read_imageui(input, coord_in)) * in_scale + in_tail;
top_r = top_r - top_l;
top = top_l + x_lerp * top_r;
dst = convert_uint4(top * out_scale + out_tail);
write_imageui(output, coord_out, dst);
}
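Note: both bilinear kernels above use the standard resize coordinate transform in_x = (x_out + half_pixel_value) * scale_x - half_pixel_value (half_pixel_value is presumably 0.5 when half-pixel centers are requested and 0 otherwise), then lerp between the two neighboring columns. A hedged host-side sketch in C, with explicit clamping standing in for the image sampler's border behavior:

#include <math.h>

/* 1D bilinear sample of src[0..len-1] at output column x_out. */
static float resize_1d_bilinear_ref(const float *src, int len, int x_out,
                                    float scale_x, float half_pixel)
{
    float in_x   = ((float)x_out + half_pixel) * scale_x - half_pixel;
    float left_f = floorf(in_x);
    float x_lerp = in_x - left_f;              /* fractional weight */
    int   left   = (int)left_f;
    int   right  = left + 1;
    if (left  < 0)    left  = 0;               /* clamp at the borders, */
    if (right >= len) right = len - 1;         /* as the image read does */
    return src[left] + x_lerp * (src[right] - src[left]);
}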
@@ -0,0 +1,36 @@
#define NEAREST_INDEX_PROCESS() \
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x + round_value; \
int in_x_idx = convert_int(in_x); \

__kernel void resize_1d_nearest_F32toF32(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
float scale_x,
float half_pixel_value,
float round_value)
{
NEAREST_INDEX_PROCESS()
int4 coord_in = (int4)(in_x_idx, coord_out.y, coord_out.z, 0);
float4 dst;
dst = read_imagef(input, coord_in);
write_imagef(output, coord_out, dst);
}
__kernel void resize_1d_nearest_U8toU8(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
float scale_x,
float half_pixel_value,
float round_value,
float output_scale,
float output_tail)
{
NEAREST_INDEX_PROCESS()
int4 coord_in = (int4)(in_x_idx, coord_out.y, coord_out.z, 0);
uint4 dst;
dst = convert_uint4(convert_float4(read_imageui(input, coord_in)) * output_scale + output_tail);
write_imageui(output, coord_out, dst);
}
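Note: the nearest kernels reduce index selection to one fused expression: in_x = (x_out + half_pixel_value) * scale_x + round_value, truncated by convert_int. round_value is presumably 0.5 for the round-to-nearest variants and 0 for the floor-style ones. Equivalent C:

/* Nearest-neighbor source column for output column x_out. */
static int resize_1d_nearest_index(int x_out, float scale_x,
                                   float half_pixel, float round_value)
{
    float in_x = ((float)x_out + half_pixel) * scale_x + round_value;
    return (int)in_x;   /* convert_int truncates toward zero */
}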
@@ -1,5 +1,7 @@
 #include "cl_viv_vx_ext.h"
 
+_viv_uniform float alpha;
+
 float4 eltwise_unary_sin(float4 x)
 {
     return native_sin(x);
@@ -24,7 +26,7 @@ float4 eltwise_unary_log(float4 x)
 float4 eltwise_unary_elu(float4 val)
 {
     float4 x = val * logE;
-    x = exp2(x) - 1;
+    x = exp2(x) * alpha - alpha;
     return val < 0 ? x : val;
 }
 
@@ -78,7 +80,8 @@ _viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4;
 __kernel void func_name##_##src_type_name##to##dst_type_name##_2D( \
     __read_only  image2d_array_t input, \
     __write_only image2d_array_t output, \
-    int   type \
+    int   type, \
+    float _alpha \
     ) \
 { \
     int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
@@ -194,7 +197,8 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8;
 __kernel void func_name##_BF16toBF16_2D( \
     __read_only  image2d_array_t input, \
     __write_only image2d_array_t output, \
-    int   type \
+    int   type, \
+    float _alpha \
     ) \
 { \
     int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
@@ -1,5 +1,7 @@
 #include "cl_viv_vx_ext.h"
 
+_viv_uniform float alpha;
+
 float4 eltwise_unary_sin(float4 x)
 {
     return native_sin(x);
@@ -24,7 +26,7 @@ float4 eltwise_unary_log(float4 x)
 float4 eltwise_unary_elu(float4 val)
 {
     float4 x = val * logE;
-    x = exp2(x) - 1;
+    x = exp2(x) * alpha - alpha;
     return val < 0 ? x : val;
 }
 
@@ -78,7 +80,8 @@ _viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4;
 __kernel void func_name##_##src_type_name##to##dst_type_name( \
     __read_only  image2d_array_t input, \
     __write_only image2d_array_t output, \
-    int   type \
+    int   type, \
+    float _alpha \
     ) \
 { \
     int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
@@ -192,7 +195,8 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8;
 __kernel void func_name##_BF16toBF16( \
     __read_only  image2d_array_t input, \
     __write_only image2d_array_t output, \
-    int   type \
+    int   type, \
+    float _alpha \
     ) \
 { \
     int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
@@ -0,0 +1,148 @@
#include "cl_viv_vx_ext.h"
_viv_uniform float scale_x;
_viv_uniform int out_height;
_viv_uniform VXC_512Bits uniConvertI32toI16_2x8;
_viv_uniform VXC_512Bits uniGetMaskShift_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_odd_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_even_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform float half_pixel_value;
__kernel void resize_1d_bilinear_BF16toBF16_DOWN
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);
float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value;
float4 left_x_f = floor(in_x);
float4 x_lerp = in_x - left_x_f;
int4 left_x_idx = convert_int4(left_x_f);
vxc_short8 top;
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0);
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
do
{
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
vxc_ushort8 src;
float4 left4;
float4 right4;
float4 dst4;
VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8);
_viv_asm(COPY, right4, src, 16);
VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8);
_viv_asm(COPY, left4, src, 16);
right4 -= left4;
dst4 = right4 * x_lerp + left4;
vxc_ushort8 tmp, dst;
_viv_asm(COPY, tmp, dst4, 16);
dst.s0123 = tmp.s1357;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_in.y++;
coord_out.y ++;
} while (coord_out.y < out_height);
}
__kernel void resize_1d_bilinear_BF16toBF16_UP
(
image2d_array_t input,
image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);
float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value;
float4 left_x_f = floor(in_x);
float4 x_lerp = in_x - left_x_f;
int4 left_x_idx = convert_int4(left_x_f);
float4 right_x_f = ceil(in_x);
int4 right_x_idx = convert_int4(right_x_f);
vxc_ushort8 src0, src1, dst0;
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0);
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_ushort8 bitextract_p0;
vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8);
vxc_ushort8 constData = 16;
VXC_DP2x8(maskShift, bitextract_p0, constData, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
do
{
VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.y ++;
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_ushort8 dst_tmp;
float4 left4;
float4 right4;
VXC_DP2x8(dst_tmp, dst0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, left4, dst_tmp, 16);
VXC_DP2x8(dst_tmp, dst0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, right4, dst_tmp, 16);
right4 -= left4;
float4 dst4 = right4 * x_lerp + left4;
vxc_ushort8 tmp, dst;
_viv_asm(COPY, tmp, dst4, 16);
dst.s0123 = tmp.s1357;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_out.y++;
} while (coord_out.y < out_height);
}
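Note: the BF16 kernels never convert through half. They widen bf16 to f32 by moving each 16-bit pattern into the high half of a 32-bit word (the uniConvBF16toF32_* dot-product arrangements) and narrow back by keeping the odd 16-bit lanes of the float results (dst.s0123 = tmp.s1357), i.e. truncating f32 to bf16. A bit-level C sketch of the same conversions (truncation only, no rounding; illustrative):

#include <stdint.h>
#include <string.h>

static float bf16_to_f32(uint16_t b)    /* widen: bf16 bits -> high 16 of f32 */
{
    uint32_t u = (uint32_t)b << 16;
    float f;
    memcpy(&f, &u, sizeof f);
    return f;
}

static uint16_t f32_to_bf16(float f)    /* narrow: keep high 16 bits (truncate) */
{
    uint32_t u;
    memcpy(&u, &f, sizeof u);
    return (uint16_t)(u >> 16);
}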
@@ -0,0 +1,136 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniResizeNxDown_2x8;
_viv_uniform int out_height;
#define RESIZE_1D_NX_DOWN_8BIT_SAME_PROCESS(read_type, data_type) \
read_type read_data, save_data; \
data_type in0, result; \
\
int8 input_desc; \
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr); \
\
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord_out.w, baseAddr); \
\
while (coord_out.y < out_height) \
{ \
VXC_OP4(img_load_3d, read_data, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, in0, read_data, 16); \
VXC_DP2x8(result, in0, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResizeNxDown_2x8); \
_viv_asm(COPY, save_data, result, 16); \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, save_data, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
coord_in.y++; \
coord_out.y++; \
} \

#define RESIZE_1D_2X_DOWN_8BIT_HALF_SAME(name0, name1, read_type, data_type) \
__kernel void resize_1d_bilinear_##name0##to##name1##_DOWN_2X_HALF_SAME \
( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int scale_type \
) \
{ \
int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); \
int4 coord_in = (int4)(get_global_id(0), 0, get_global_id(1), 0); \
coord_in.x = coord_out.x << 1; \
RESIZE_1D_NX_DOWN_8BIT_SAME_PROCESS(read_type, data_type) \
}
RESIZE_1D_2X_DOWN_8BIT_HALF_SAME(U8, U8, vxc_uchar16, vxc_uchar16)
RESIZE_1D_2X_DOWN_8BIT_HALF_SAME(I8, I8, vxc_char16, vxc_char16)
#define RESIZE_1D_2X_DOWN_8BIT_SAME(name0, name1, read_type, data_type) \
__kernel void resize_1d_bilinear_##name0##to##name1##_DOWN_2X_SAME \
( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int scale_type \
) \
{ \
int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); \
int4 coord_in = (int4)(get_global_id(0), 0, get_global_id(1), 0); \
coord_in.x = coord_out.x << 1; \
RESIZE_1D_NX_DOWN_8BIT_SAME_PROCESS(read_type, data_type) \
}
RESIZE_1D_2X_DOWN_8BIT_SAME(U8, U8, vxc_uchar16, vxc_uchar16)
RESIZE_1D_2X_DOWN_8BIT_SAME(I8, I8, vxc_char16, vxc_char16)
#define RESIZE_1D_NX_DOWN_16BIT_SAME_PROCESS(read_type, data_type) \
read_type read_data, read_data1, save_data; \
data_type in0, in1, result; \
\
int8 input_desc; \
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr); \
\
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord_out.w, baseAddr); \
\
while (coord_out.y < out_height) \
{ \
VXC_OP4(img_load_3d, read_data, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, in0, read_data, 16); \
VXC_OP4(img_load_3d, read_data1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, in1, read_data1, 16); \
VXC_DP2x8(result, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResizeNxDown_2x8); \
_viv_asm(COPY, save_data, result, 16); \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, save_data, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
coord_in.y++; \
coord_out.y++; \
} \

#define RESIZE_1D_2X_DOWN_16BIT_HALF_SAME(name0, name1, read_type, data_type) \
__kernel void resize_1d_bilinear_##name0##to##name1##_DOWN_2X_HALF_SAME \
( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int scale_type \
) \
{ \
int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); \
int4 coord_in = (int4)(get_global_id(0), 0, get_global_id(1), 0); \
coord_in.x = coord_out.x << 1; \
RESIZE_1D_NX_DOWN_16BIT_SAME_PROCESS(read_type, data_type) \
}
RESIZE_1D_2X_DOWN_16BIT_HALF_SAME(I16, I16, vxc_short8, vxc_short8)
RESIZE_1D_2X_DOWN_16BIT_HALF_SAME(F16, F16, vxc_short8, vxc_half8)
#define RESIZE_1D_2X_DOWN_16BIT_SAME(name0, name1, read_type, data_type) \
__kernel void resize_1d_bilinear_##name0##to##name1##_DOWN_2X_SAME \
( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int scale_type \
) \
{ \
int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); \
int4 coord_in = (int4)(get_global_id(0), 0, get_global_id(1), 0); \
coord_in.x = coord_out.x << 1; \
RESIZE_1D_NX_DOWN_16BIT_SAME_PROCESS(read_type, data_type) \
}
RESIZE_1D_2X_DOWN_16BIT_SAME(I16, I16, vxc_short8, vxc_short8)
RESIZE_1D_2X_DOWN_16BIT_SAME(F16, F16, vxc_short8, vxc_half8)
@@ -0,0 +1,216 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniExtact8Bit_2x8;
_viv_uniform VXC_512Bits uniFp16toFp32_4x4;
_viv_uniform VXC_512Bits uniRightSubLeft_4x4;
_viv_uniform VXC_512Bits uniExtactHalf8_2x8;
_viv_uniform float scale_x;
_viv_uniform int out_height;
_viv_uniform float uint8Scale;
_viv_uniform float output_ZP;
_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4;
_viv_uniform VXC_512Bits uniConvertI32toI16_2x8;
_viv_uniform VXC_512Bits uniGetMaskShift_2x8;
_viv_uniform float half_pixel_value;
_viv_uniform VXC_512Bits uniConvertFp2FP32_left_4x4;
_viv_uniform VXC_512Bits uniConvertFp2FP32_right_4x4;
__kernel void resize_1d_bilinear_F16toF16_DOWN
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);
float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value;
float4 left_x_f = floor(in_x);
float4 x_lerp = in_x - left_x_f;
int4 left_x_idx = convert_int4(left_x_f);
float4 left4;
float4 right4;
vxc_ushort8 src, result;
vxc_half8 src_half, dst;
int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0);
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
do
{
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, src_half, src, 16);
VXC_DP4x4(left4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_left_4x4);
VXC_DP4x4(right4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_right_4x4);
right4 -= left4;
float4 dst4 = right4 * x_lerp + left4;
half4 tmp;
_viv_asm(CONV, tmp, dst4);
VXC_DP2x8(dst, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
_viv_asm(COPY, result, dst, 16);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_in.y++;
coord_out.y ++;
} while (coord_out.y < out_height);
}
__kernel void resize_1d_bilinear_F16toU8_DOWN
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);
float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value;
float4 left_x_f = floor(in_x);
float4 x_lerp = in_x - left_x_f;
int4 left_x_idx = convert_int4(left_x_f);
float4 left4;
float4 right4;
vxc_ushort8 src;
vxc_uchar8 result;
vxc_half8 src_half, dst;
int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0);
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
do
{
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, src_half, src, 16);
VXC_DP4x4(left4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_left_4x4);
VXC_DP4x4(right4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_right_4x4);
right4 -= left4;
float4 dst4 = right4 * x_lerp + left4;
dst4 = dst4 * uint8Scale + output_ZP;
int4 dst = convert_int4_rte(dst4);
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_in.y++;
coord_out.y ++;
} while (coord_out.y < out_height);
}
__kernel void resize_1d_bilinear_F16toF16_UP
(
image2d_array_t input,
image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);
float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value;
float4 left_x_f = floor(in_x);
float4 x_lerp = in_x - left_x_f;
int4 left_x_idx = convert_int4(left_x_f);
float4 right_x_f = ceil(in_x);
int4 right_x_idx = convert_int4(right_x_f);
vxc_ushort8 src0, src1, dst0;
vxc_half8 top;
int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0);
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_ushort8 bitextract_p0;
vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8);
vxc_ushort8 constData = 16;
VXC_DP2x8(maskShift, bitextract_p0, constData, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
do
{
VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, top, dst0, 16);
coord_in.y ++;
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
float4 left4;
float4 right4;
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4);
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);
float4 dst4 = right4 * x_lerp + left4;
half4 tmp;
_viv_asm(CONV, tmp, dst4);
VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
_viv_asm(COPY, dst0, top, 16);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_out.y++;
} while (coord_out.y < out_height);
}
@@ -0,0 +1,147 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniExtact8Bit_2x8;
_viv_uniform float scale_x;
_viv_uniform int out_height;
_viv_uniform VXC_512Bits uniConvertI32toI16_2x8;
_viv_uniform VXC_512Bits uniGetMaskShift_2x8;
_viv_uniform VXC_512Bits uniConvertDFP2FP32_part1_4x4;
_viv_uniform VXC_512Bits uniConvertDFP2FP32_4x4;
_viv_uniform float dfpScale;
_viv_uniform float half_pixel_value;
_viv_uniform VXC_512Bits uniConvertDFP2FP32_left_4x4;
_viv_uniform VXC_512Bits uniConvertDFP2FP32_right_4x4;
__kernel void resize_1d_bilinear_I16toI16_UP
(
image2d_array_t input,
image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);
float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value;
float4 left_x_f = floor(in_x);
float4 x_lerp = in_x - left_x_f;
int4 left_x_idx = convert_int4(left_x_f);
float4 right_x_f = ceil(in_x);
int4 right_x_idx = convert_int4(right_x_f);
vxc_ushort8 src0, src1, dst0;
vxc_short8 top;
int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0);
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_ushort8 bitextract_p0;
vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8);
vxc_ushort8 constData = 16;
VXC_DP2x8(maskShift, bitextract_p0, constData, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
do
{
VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, top, dst0, 16);
float4 left4;
float4 right4;
coord_in.y ++;
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4);
float4 dst4 = right4 * x_lerp + left4;
dst4 = dst4 * dfpScale;
int4 dst = convert_int4_rte(dst4);
VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_out.y ++;
} while (coord_out.y < out_height);
}
__kernel void resize_1d_bilinear_I16toI16_DOWN
(
image2d_array_t input,
image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);
float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value;
float4 left_x_f = floor(in_x);
float4 x_lerp = in_x - left_x_f;
int4 left_x_idx = convert_int4(left_x_f);
vxc_short8 src;
int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0);
float4 left4;
float4 right4;
vxc_short8 result;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
do
{
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
VXC_DP4x4(left4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_left_4x4);
VXC_DP4x4(right4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_right_4x4);
right4 -= left4;
float4 dst4 = right4 * x_lerp + left4;
dst4 = dst4 * dfpScale;
int4 dst = convert_int4_rte(dst4);
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_in.y++;
coord_out.y ++;
} while (coord_out.y < out_height);
}
@@ -0,0 +1,148 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniExtact8Bit_2x8;
_viv_uniform float scale_x;
_viv_uniform int out_height;
_viv_uniform VXC_512Bits uniConvertI32toI16_2x8;
_viv_uniform VXC_512Bits uniGetMaskShift_2x8;
_viv_uniform VXC_512Bits uniConvertDFP2FP32_part1_4x4;
_viv_uniform VXC_512Bits uniConvertDFP2FP32_4x4;
_viv_uniform float dfpScale;
_viv_uniform float half_pixel_value;
_viv_uniform VXC_512Bits uniConvertDFP2FP32_left_4x4;
_viv_uniform VXC_512Bits uniConvertDFP2FP32_right_4x4;
__kernel void resize_1d_bilinear_I8toI8_UP
(
image2d_array_t input,
image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);
float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value;
float4 left_x_f = floor(in_x);
float4 x_lerp = in_x - left_x_f;
int4 left_x_idx = convert_int4(left_x_f);
float4 right_x_f = ceil(in_x);
int4 right_x_idx = convert_int4(right_x_f);
vxc_uchar16 src0, dst0;
vxc_char16 top;
int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0);
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
vxc_ushort8 bitextract_p0;
vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8);
vxc_ushort8 constData = 8;
VXC_DP2x8(maskShift, bitextract_p0, constData, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
do
{
VXC_BitExtract(dst0, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, top, dst0, 16);
coord_in.y++;
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
float4 left4;
float4 right4;
VXC_DP4x4(left4, top, top, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);
VXC_DP4x4(right4, top, top, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4);
float4 dst4 = right4 * x_lerp + left4;
dst4 = dst4 * dfpScale;
int4 dst = convert_int4_rte(dst4);
VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_out.y ++;
} while (coord_out.y < out_height);
}
__kernel void resize_1d_bilinear_I8toI8_DOWN
(
image2d_array_t input,
image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);
float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value;
float4 left_x_f = floor(in_x);
float4 x_lerp = in_x - left_x_f;
int4 left_x_idx = convert_int4(left_x_f);
vxc_char16 src;
int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0);
float4 left4;
float4 right4;
vxc_char16 result;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
do
{
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
VXC_DP4x4(left4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_left_4x4);
VXC_DP4x4(right4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_right_4x4);
right4 -= left4;
float4 dst4 = right4 * x_lerp + left4;
dst4 = dst4 * dfpScale;
int4 dst = convert_int4_rte(dst4);
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_in.y++;
coord_out.y ++;
} while (coord_out.y < out_height);
}
@@ -0,0 +1,212 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniU8SubZPtoFp32_4x4;
_viv_uniform VXC_512Bits uniU8SubZPtoFp32_left_4x4;
_viv_uniform VXC_512Bits uniU8SubZPtoFp32_right_4x4;
_viv_uniform VXC_512Bits uniExtact8Bit_2x8;
_viv_uniform float scale_x;
_viv_uniform int out_height;
_viv_uniform int input_ZP;
_viv_uniform float uint8Scale;
_viv_uniform float output_ZP;
_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;
_viv_uniform VXC_512Bits uniConvertI32toI16_2x8;
_viv_uniform VXC_512Bits uniGetMaskShift_2x8;
_viv_uniform float half_pixel_value;
__kernel void resize_1d_bilinear_U8toF16_DOWN
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);
float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value;
float4 left_x_f = floor(in_x);
float4 x_lerp = in_x - left_x_f;
int4 left_x_idx = convert_int4(left_x_f);
vxc_uchar16 src;
int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0);
float4 left4;
float4 right4;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
unsigned char inputZP;
_viv_asm(COPY, inputZP, input_ZP, 4);
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
do
{
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
VXC_DP4x4(left4, src, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);
VXC_DP4x4(right4, src, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_right_4x4);
right4 -= left4;
float4 dst4 = right4 * x_lerp + left4;
dst4 *= uint8Scale;
half4 dst;
_viv_asm(CONV, dst, dst4);
vxc_short8 dst_short;
_viv_asm(COPY, dst_short, dst, 16);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst_short,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_in.y++;
coord_out.y ++;
} while (coord_out.y < out_height);
}
__kernel void resize_1d_bilinear_U8toU8_UP
(
image2d_array_t input,
image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);
float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value;
float4 left_x_f = floor(in_x);
float4 x_lerp = in_x - left_x_f;
int4 left_x_idx = convert_int4(left_x_f);
float4 right_x_f = ceil(in_x);
int4 right_x_idx = convert_int4(right_x_f);
vxc_uchar16 src0, src1;
vxc_uchar16 top;
int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0);
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
vxc_ushort8 bitextract_p0;
vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8);
vxc_ushort8 constData = 8;
VXC_DP2x8(maskShift, bitextract_p0, constData, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
do
{
VXC_BitExtract(top, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.y++;
VXC_OP4(img_load_3d, src0, input, coord_in.xyww,
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
float4 left4;
float4 right4;
unsigned char inputZP;
_viv_asm(COPY, inputZP, input_ZP, 4);
VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);
float4 dst4 = right4 * x_lerp + left4;
dst4 = dst4 * uint8Scale + output_ZP;
int4 dst = convert_int4_rte(dst4);
VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_out.y ++;
} while (coord_out.y < out_height);
}
__kernel void resize_1d_bilinear_U8toU8_DOWN
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);
float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value;
float4 left_x_f = floor(in_x);
float4 x_lerp = in_x - left_x_f;
int4 left_x_idx = convert_int4(left_x_f);
vxc_uchar16 src;
int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0);
float4 left4;
float4 right4;
vxc_uchar16 result;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
unsigned char inputZP;
_viv_asm(COPY, inputZP, input_ZP, 4);
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
do
{
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
VXC_DP4x4(left4, src, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);
VXC_DP4x4(right4, src, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_right_4x4);
right4 -= left4;
float4 dst4 = right4 * x_lerp + left4;
dst4 = dst4 * uint8Scale + output_ZP;
int4 dst = convert_int4_rte(dst4);
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_in.y++;
coord_out.y ++;
} while (coord_out.y < out_height);
}
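Note: every U8 path above follows the same dequantize -> lerp -> requantize shape: subtract input_ZP, interpolate in float, then multiply by uint8Scale (which presumably folds the input and output quantization scales together) and add output_ZP before the round-to-nearest-even conversion. A simplified C model (the helper name is illustrative):

#include <math.h>
#include <stdint.h>

static uint8_t u8_lerp_ref(uint8_t left, uint8_t right, float x_lerp,
                           int input_ZP, float uint8Scale, float output_ZP)
{
    float l = (float)((int)left  - input_ZP);
    float r = (float)((int)right - input_ZP);
    float v = l + x_lerp * (r - l);                   /* lerp dequantized */
    float q = nearbyintf(v * uint8Scale + output_ZP); /* convert_int4_rte */
    if (q < 0.0f)   q = 0.0f;                         /* saturate to u8 */
    if (q > 255.0f) q = 255.0f;
    return (uint8_t)q;
}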
@@ -0,0 +1,78 @@
#include "cl_viv_vx_ext.h"
_viv_uniform float scale_x;
_viv_uniform int out_height;
_viv_uniform VXC_512Bits uniConvertI32toI16_2x8;
_viv_uniform VXC_512Bits uniGetMaskShift_2x8;
_viv_uniform VXC_512Bits uniBilinear_4x4;
_viv_uniform float half_pixel_value;
__kernel void resize_1d_bilinear_U8toU8_UP_opt
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int align_corners,
int half_pixel_centers,
__read_only image2d_array_t scale
)
{
int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(2), 0);
int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);
float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value;
float4 left_x_f = floor(in_x);
int4 left_x_idx = convert_int4(left_x_f);
int4 right_x_idx = left_x_idx + 1;
vxc_uchar16 src0;
vxc_uchar16 src_mask;
int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0);
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
vxc_ushort8 bitextract_p0;
vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8);
vxc_ushort8 constData = 8;
VXC_DP2x8(maskShift, bitextract_p0, constData,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);
vxc_ushort8 lerp_0;
vxc_half8 lerp;
int2 coord = (int2)(coord_out.x * 2, 0);
VXC_ReadImage(lerp_0, scale, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, lerp, lerp_0, 16);
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
do
{
VXC_BitExtract(src_mask, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.y++;
VXC_OP4(img_load_3d, src0, input, coord_in.xyww,
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
vxc_uchar16 dst;
VXC_DP4x4(dst, src_mask, lerp,
VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4x4);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_out.y ++;
} while (coord_out.y < out_height);
}
@@ -0,0 +1,155 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniResizeNxUp_2x8;
_viv_uniform int out_height;
#define RESIZE_1D_NX_SAME_PROCESS(read_type, data_type) \
read_type read_data, save_data; \
data_type in0, result; \
\
int8 input_desc; \
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr); \
\
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord_out.w, baseAddr); \
\
while (coord_out.y < out_height) \
{ \
VXC_OP4(img_load_3d, read_data, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, in0, read_data, 16); \
VXC_DP2x8(result, in0, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResizeNxUp_2x8); \
_viv_asm(COPY, save_data, result, 16); \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, save_data, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
coord_in.y++; \
coord_out.y++; \
} \

#define RESIZE_1D_2X_HALF_SAME(name0, name1, read_type, data_type) \
__kernel void resize_1d_bilinear_##name0##to##name1##_UP_2X_HALF_SAME \
( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int scale_type \
) \
{ \
int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); \
int4 coord_in = (int4)(get_global_id(0), 0, get_global_id(1), 0); \
coord_in.x = (coord_out.x * 2 - 1) >> 2; \
coord_in.x = coord_out.x == 0 ? -1 : coord_in.x; \
RESIZE_1D_NX_SAME_PROCESS(read_type, data_type) \
}
RESIZE_1D_2X_HALF_SAME(U8, U8, vxc_uchar16, vxc_uchar16)
RESIZE_1D_2X_HALF_SAME(I8, I8, vxc_char16, vxc_char16)
RESIZE_1D_2X_HALF_SAME(I16, I16, vxc_short8, vxc_short8)
RESIZE_1D_2X_HALF_SAME(F16, F16, vxc_short8, vxc_half8)
#define RESIZE_1D_2X_SAME(name0, name1, read_type, data_type) \
__kernel void resize_1d_bilinear_##name0##to##name1##_UP_2X_SAME \
( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int scale_type \
) \
{ \
int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); \
int4 coord_in = (int4)(get_global_id(0), 0, get_global_id(1), 0); \
coord_in.x = coord_out.x >> 1; \
RESIZE_1D_NX_SAME_PROCESS(read_type, data_type) \
}
RESIZE_1D_2X_SAME(U8, U8, vxc_uchar16, vxc_uchar16)
RESIZE_1D_2X_SAME(I8, I8, vxc_char16, vxc_char16)
RESIZE_1D_2X_SAME(I16, I16, vxc_short8, vxc_short8)
RESIZE_1D_2X_SAME(F16, F16, vxc_short8, vxc_half8)
#define RESIZE_1D_4X_HALF_SAME(name0, name1, read_type, data_type) \
__kernel void resize_1d_bilinear_##name0##to##name1##_UP_4X_HALF_SAME \
( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int scale_type \
) \
{ \
int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); \
int4 coord_in = (int4)(get_global_id(0), 0, get_global_id(1), 0); \
coord_in.x = (coord_out.x * 2 - 3) >> 3; \
coord_in.x = coord_out.x == 0 ? -1 : coord_in.x; \
RESIZE_1D_NX_SAME_PROCESS(read_type, data_type) \
}
RESIZE_1D_4X_HALF_SAME(U8, U8, vxc_uchar16, vxc_uchar16)
RESIZE_1D_4X_HALF_SAME(I8, I8, vxc_char16, vxc_char16)
RESIZE_1D_4X_HALF_SAME(I16, I16, vxc_short8, vxc_short8)
RESIZE_1D_4X_HALF_SAME(F16, F16, vxc_short8, vxc_half8)
#define RESIZE_1D_4X_SAME(name0, name1, read_type, data_type) \
__kernel void resize_1d_bilinear_##name0##to##name1##_UP_4X_SAME \
( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int scale_type \
) \
{ \
int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); \
int4 coord_in = (int4)(get_global_id(0), 0, get_global_id(1), 0); \
coord_in.x = coord_out.x >> 2; \
RESIZE_1D_NX_SAME_PROCESS(read_type, data_type) \
}
RESIZE_1D_4X_SAME(U8, U8, vxc_uchar16, vxc_uchar16)
RESIZE_1D_4X_SAME(I8, I8, vxc_char16, vxc_char16)
RESIZE_1D_4X_SAME(I16, I16, vxc_short8, vxc_short8)
RESIZE_1D_4X_SAME(F16, F16, vxc_short8, vxc_half8)
#define RESIZE_1D_8X_HALF_SAME(name0, name1, read_type, data_type) \
__kernel void resize_1d_bilinear_##name0##to##name1##_UP_8X_HALF_SAME \
( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int scale_type \
) \
{ \
int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); \
int4 coord_in = (int4)(get_global_id(0), 0, get_global_id(1), 0); \
coord_in.x = (coord_out.x * 2 - 7) >> 4; \
coord_in.x = coord_out.x == 0 ? -1 : coord_in.x; \
RESIZE_1D_NX_SAME_PROCESS(read_type, data_type) \
}
RESIZE_1D_8X_HALF_SAME(U8, U8, vxc_uchar16, vxc_uchar16)
RESIZE_1D_8X_HALF_SAME(I8, I8, vxc_char16, vxc_char16)
RESIZE_1D_8X_HALF_SAME(I16, I16, vxc_short8, vxc_short8)
RESIZE_1D_8X_HALF_SAME(F16, F16, vxc_short8, vxc_half8)
#define RESIZE_1D_8X_SAME(name0, name1, read_type, data_type) \
__kernel void resize_1d_bilinear_##name0##to##name1##_UP_8X_SAME \
( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int scale_type \
) \
{ \
int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); \
int4 coord_in = (int4)(get_global_id(0), 0, get_global_id(1), 0); \
coord_in.x = coord_out.x >> 3; \
RESIZE_1D_NX_SAME_PROCESS(read_type, data_type) \
}
RESIZE_1D_8X_SAME(U8, U8, vxc_uchar16, vxc_uchar16)
RESIZE_1D_8X_SAME(I8, I8, vxc_char16, vxc_char16)
RESIZE_1D_8X_SAME(I16, I16, vxc_short8, vxc_short8)
RESIZE_1D_8X_SAME(F16, F16, vxc_short8, vxc_half8)
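Note: the *_HALF_SAME variants fold the half-pixel transform into integer math. For a 2x upscale, scale_x = 0.5 and in_x = (x_out + 0.5) * 0.5 - 0.5 = (2 * x_out - 1) / 4, so floor(in_x) is (x_out * 2 - 1) >> 2, with column 0 forced to -1 for the border clamp; the plain *_SAME variants are the no-half-pixel case, x_out >> log2(N). A quick C check of the 2x mapping:

#include <assert.h>
#include <math.h>

int main(void)
{
    for (int x_out = 0; x_out < 1024; x_out++) {
        float in_x = ((float)x_out + 0.5f) * 0.5f - 0.5f;
        int   ref  = (int)floorf(in_x);        /* exact left neighbor */
        int   fast = (x_out * 2 - 1) >> 2;     /* the macro's shift form */
        if (x_out == 0) fast = -1;             /* kernel's explicit select */
        assert(ref == fast);
    }
    return 0;
}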
@@ -0,0 +1,337 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniMultiplyAndPostShift_2x8;
_viv_uniform float scale_x;
_viv_uniform float half_pixel_value;
_viv_uniform float round_value;
_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp
#define NEAREST_INDEX_PROCESS() \
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); \
float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x + round_value; \
int4 in_x_idx = convert_int4(in_x); \

__kernel void resize_1d_nearest_F16toF16
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
NEAREST_INDEX_PROCESS()
vxc_short8 src;
int4 coord_in = (int4)(in_x_idx.x, coord_out.y, coord_out.z, 0);
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.y;
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.z;
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.w;
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
_viv_uniform VXC_512Bits uniGetExtractData_2x8;
__kernel void resize_1d_nearest_F16toF16_op
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
NEAREST_INDEX_PROCESS()
vxc_ushort8 src0, src1, dst;
int4 coord_in = (int4)(in_x_idx.x, coord_out.y, coord_out.z, 0);
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16);
vxc_ushort8 input_idx;
_viv_asm(COPY, input_idx, in_x_idx, 16);
VXC_DP2x8(mask, input_idx, input_idx, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetExtractData_2x8);
VXC_BitExtract(dst, src0, src1, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
_viv_uniform VXC_512Bits uniConvertI8toI8_2x8;
__kernel void resize_1d_nearest_I8toI8
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
NEAREST_INDEX_PROCESS()
vxc_char16 src;
int4 coord_in = (int4)(in_x_idx.x, coord_out.y, coord_out.z, 0);
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.y;
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.z;
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.w;
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_DP2x8(src, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
__kernel void resize_1d_nearest_I8toI8_op
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
NEAREST_INDEX_PROCESS()
vxc_uchar16 src0, dst0;
vxc_char16 dst;
int4 coord_in = (int4)(in_x_idx.x, coord_out.y, coord_out.z, 0);
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8);
vxc_ushort8 input_idx;
_viv_asm(COPY, input_idx, in_x_idx, 16);
VXC_DP2x8(mask, input_idx, input_idx, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetExtractData_2x8);
VXC_BitExtract(dst0, src0, src0, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, dst, dst0, 8);
VXC_DP2x8(dst, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
__kernel void resize_1d_nearest_U8toU8
(
image2d_array_t input,
image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
NEAREST_INDEX_PROCESS()
vxc_uchar16 src;
int4 coord_in = (int4)(in_x_idx.x, coord_out.y, coord_out.z, 0);
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.y;
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.z;
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.w;
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
vxc_ushort8 multiplier;
_viv_asm(COPY, multiplier, multAndoutZP, 16);
VXC_DP2x8(src, src, multiplier, \
VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src,
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}
__kernel void resize_1d_nearest_U8toU8_op
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
NEAREST_INDEX_PROCESS()
vxc_uchar16 src0, dst;
int4 coord_in = (int4)(in_x_idx.x, coord_out.y, coord_out.z, 0);
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8);
vxc_ushort8 input_idx;
_viv_asm(COPY, input_idx, in_x_idx, 16);
VXC_DP2x8(mask, input_idx, input_idx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetExtractData_2x8);
VXC_BitExtract(dst, src0, src0, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
vxc_ushort8 multiplier;
_viv_asm(COPY, multiplier, multAndoutZP, 16);
VXC_DP2x8(dst, dst, multiplier, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}
__kernel void resize_1d_nearest_I16toI16
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
NEAREST_INDEX_PROCESS()
vxc_short8 src;
int4 coord_in = (int4)(in_x_idx.x, coord_out.y, coord_out.z, 0);
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.y;
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.z;
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.w;
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_DP2x8(src, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src,
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}
__kernel void resize_1d_nearest_I16toI16_op
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
NEAREST_INDEX_PROCESS()
vxc_ushort8 src0, src1, dst0;
vxc_short8 dst;
int4 coord_in = (int4)(in_x_idx.x, coord_out.y, coord_out.z, 0);
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16);
vxc_ushort8 input_idx;
_viv_asm(COPY, input_idx, in_x_idx, 16);
VXC_DP2x8(mask, input_idx, input_idx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetExtractData_2x8);
VXC_BitExtract(dst0, src0, src1, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, dst, dst0, 8);
VXC_DP2x8(dst, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}

File diff suppressed because it is too large.


@@ -176,8 +176,8 @@ static vsi_status op_optimize
         reshape 3d input (xcn) --> 4d input (whcn)
         reshape 3d output(xcn) --> 4d output(whcn)
     */
-    shape[0] = inputs[0]->attr.size[0];
-    shape[1] = 1;
+    shape[0] = 1;
+    shape[1] = inputs[0]->attr.size[0];
     shape[2] = inputs[0]->attr.size[1];
     shape[3] = inputs[0]->attr.size[2];
     dim = 4;


@@ -136,6 +136,7 @@ static vsi_bool op_check
         IO_TYPE(D_I32, D_F32)
         IO_TYPE(D_I32, D_I32)
         IO_TYPE(D_I32, D_U32)
+        IO_TYPE(D_I32, D_F16)
         IO_TYPE(D_I32, D_BOOL8)
         IO_TYPE(D_U32, D_F32)
         IO_TYPE(D_U32, D_I32)
@@ -176,6 +177,7 @@ static vsi_bool op_check
         IO_TYPE(D_U8|Q_ASYM, D_F32)
         IO_TYPE(D_U8|Q_ASYM, D_I32)
         IO_TYPE(D_BF16, D_BF16)
+        IO_TYPE(D_U8, D_F16)
     END_IO_TYPE_DECL(CAST)
     if(!VALIDATE_OP_IO_TYPES(CAST, self, inputs, self->input.num, outputs, self->output.num)) {
         char* desc = generate_op_io_types_desc(inputs,


@@ -37,6 +37,29 @@
 #include "utils/vsi_nn_dtype_util.h"
 #include "utils/vsi_nn_constraint_check.h"

+static vsi_bool _enable_concat_optimize()
+{
+    char *envctrl;
+    static int32_t enableOptimize = -1;
+
+    if (enableOptimize == -1)
+    {
+        enableOptimize = 1;
+        envctrl = getenv("VSI_NN_ENABLE_CONCAT_OPTIMIZE");
+        if (envctrl)
+        {
+            enableOptimize = atoi(envctrl);
+        }
+    }
+
+    if (enableOptimize == 1)
+    {
+        return TRUE;
+    }
+
+    return FALSE;
+}
+
 static int32_t _get_input_num
     (
     vsi_nn_node_t * self,
@@ -243,7 +266,8 @@ static vsi_status op_compute
     status = VSI_SUCCESS;
     self->n = NULL;
-    if(_is_highest_dimension(self, outputs) && _is_same_quant(self, inputs, outputs))
+    if(_is_highest_dimension(self, outputs) && _is_same_quant(self, inputs, outputs)
+        && _enable_concat_optimize())
     {
         iter = self->nn_param.concat.lcl_data;
         while( NULL != iter )
@@ -397,7 +421,8 @@ static vsi_status op_optimize
     status = VSI_SUCCESS;
     /* we don't create tensor view if the axis is not the highest dimension */
     if (_is_highest_dimension(self, outputs) == FALSE ||
-        _is_same_quant(self, inputs, outputs) == FALSE)
+        _is_same_quant(self, inputs, outputs) == FALSE ||
+        _enable_concat_optimize() == FALSE)
     {
         return status;
     }
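A usage note on the new gate: _enable_concat_optimize() caches its answer in a static, so the switch has to be set before the first concat is computed or optimized. A minimal sketch, assuming a POSIX environment:

    #include <stdlib.h>

    /* Turn the concat tensor-view fast path off for debugging. Unset or "1"
     * leaves it on (the default above); any other value disables it. */
    static void disable_concat_optimize(void)
    {
        setenv("VSI_NN_ENABLE_CONCAT_OPTIMIZE", "0", 1);
    }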


@@ -194,6 +194,7 @@ static vsi_bool op_check
         IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP)
         IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP)
         IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM)
+        IO_TYPE(D_I16|Q_DFP, D_F16)
         IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP)
         IO_TYPE(D_I8|Q_DFP, D_I8|Q_ASYM)
         IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM)


@@ -31,6 +31,7 @@
 #include "vsi_nn_prv.h"
 #include "vsi_nn_ops.h"
 #include "vsi_nn_tensor.h"
+#include "vsi_nn_tensor_util.h"
 #include "utils/vsi_nn_util.h"
 #include "kernel/vsi_nn_kernel.h"
 /*
@@ -73,6 +74,7 @@ static vsi_status op_compute
     {
         status = VSI_SUCCESS;
     }
+
     vsi_nn_kernel_param_release( &param );
     return status;
 } /* op_compute() */
@@ -119,7 +121,7 @@ static vsi_bool op_setup
             VSI_NN_ROUND_FLOOR
         );
-        outputs[0]->attr.size[1] = inputs[1]->attr.size[2];
+        outputs[0]->attr.size[1] = inputs[0]->attr.size[1] * p->multiplier;
         outputs[0]->attr.size[2] = inputs[0]->attr.size[2];
         outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
     }


@@ -45,22 +45,30 @@ static vsi_status _eltwise_unary_op_compute
     )
 {
     vsi_status status = VSI_FAILURE;
+    float alpha = 0;
+    vsi_nn_kernel_param_t * param = NULL;
+
     if( NULL == self )
     {
         return status;
     }
+
+    param = vsi_nn_kernel_param_create();
+    alpha = self->nn_param.elu.alpha;
+    vsi_nn_kernel_param_add_float32( param, "alpha", alpha );
+
     // TODO: This optimzie is a hack for gpu path,
     // it should be moved to gpu kernel setup.
     self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
-        kernel_name, inputs, 1, outputs, 1, NULL );
+        kernel_name, inputs, 1, outputs, 1, param );
     if( self->n )
     {
         status = VSI_SUCCESS;
     }
+    vsi_nn_kernel_param_release( &param );
+
     return status;
 } /* _eltwise_op_compute() */
@@ -152,6 +160,19 @@ static vsi_bool op_check
     return TRUE;
 } /* op_check() */

+static vsi_status op_init
+    (
+    vsi_nn_node_t * self
+    )
+{
+    if (vsi_nn_compareVersion(self->graph, 1, 1, 29) == -1)
+    {
+        self->nn_param.elu.alpha = 1;
+    }
+
+    return VSI_SUCCESS;
+} /* op_init() */
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -166,7 +187,7 @@ extern "C" {
     { \
         return _eltwise_unary_op_compute( ""#kernel_name, self, inputs, outputs ); \
     } \
-    DEF_OP_REG(name, NULL, op_compute_##kernel_name, vsi_nn_op_common_deinit, op_check, op_setup, NULL, 2, 1)
+    DEF_OP_REG(name, op_init, op_compute_##kernel_name, vsi_nn_op_common_deinit, op_check, op_setup, NULL, 2, 1)

 DEF_ELEMENT_WISE_UNARY_OP( SIN, sin );
 DEF_ELEMENT_WISE_UNARY_OP( EXP, exp );
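The net effect: ELU now forwards a real alpha to the kernel, and the new op_init pins alpha to 1 for graphs built against versions before 1.1.29, which predate the parameter. A hedged sketch of setting it on a node (the vsi_nn_AddNode call is assumed from the rest of ovxlib):

    /* alpha scales the negative half of ELU: y = alpha * (exp(x) - 1), x < 0 */
    static void add_elu(vsi_nn_graph_t* graph)
    {
        vsi_nn_node_t* node = vsi_nn_AddNode(graph, VSI_NN_OP_ELU, 1, 1, NULL);
        node->nn_param.elu.alpha = 0.5f;
    }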


@@ -73,6 +73,7 @@ static vsi_bool op_check
         IO_TYPE(D_I32, D_I32, D_I32, D_F16, D_I32)
         IO_TYPE(D_I32, D_I32, D_F32, D_F16, D_F32)
         IO_TYPE(D_I32, D_I32, D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM)
+        IO_TYPE(D_I32, D_I32, D_F32, D_F32, D_U8|Q_ASYM)
     END_IO_TYPE_DECL(HASHTABLE_LOOKUP)
     if (!VALIDATE_OP_IO_TYPES(HASHTABLE_LOOKUP, self, inputs, self->input.num, outputs, self->output.num))
     {


@@ -0,0 +1,298 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <string.h>
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_log.h"
#include "vsi_nn_node.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
/*
Declare number of input and output.
*/
#define _INPUT_NUM (2)
#define _OUTPUT_NUM (1)
static vsi_status op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_status status = VSI_SUCCESS;
status = vsi_nn_internal_compute_node( self );
return status;
} /* op_compute() */
static vsi_bool op_check
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_nn_interp_param *p = NULL;
p = &self->nn_param.interp;
if ((p->pad_beg > 0) || (p->pad_end > 0))
{
VSILOGE("Only supports non-pos padding (cropping) for now ");
return FALSE;
}
return TRUE;
} /* op_check() */
static vsi_bool op_setup
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_nn_interp_param *p = NULL;
int32_t height_in_eff_, width_in_eff_;
int32_t height_out, width_out;
vsi_nn_internal_node_t* curr = NULL;
vsi_nn_internal_tensor_t *crop_tensor = NULL;
vsi_nn_tensor_t *crop_in_tensor = NULL;
float factor = 1.0f;
int32_t pad_beg = 0;
int32_t pad_end = 0;
if ( NULL == self )
{
return FALSE;
}
p = &self->nn_param.interp;
pad_beg = -p->pad_beg;
pad_end = -p->pad_end;
width_in_eff_ = inputs[0]->attr.size[0] + p->pad_beg + p->pad_end;
height_in_eff_ = inputs[0]->attr.size[1] + p->pad_beg + p->pad_end;
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
memcpy( outputs[0]->attr.size, inputs[0]->attr.size,
VSI_NN_MAX_DIM_NUM * sizeof( uint32_t ) );
if ((p->shrink_factor > 0) && (p->zoom_factor <= 0))
{
width_out = (width_in_eff_ - 1) / p->shrink_factor + 1;
height_out = (height_in_eff_ - 1) / p->shrink_factor + 1;
}
else if ((p->zoom_factor > 0) && (p->shrink_factor <= 0))
{
width_out = (width_in_eff_ - 1) * (p->zoom_factor - 1) + width_in_eff_;
height_out = (height_in_eff_ - 1) * (p->zoom_factor - 1) + height_in_eff_;
}
else if ((p->height > 0) && (p->width > 0))
{
width_out = p->width;
height_out = p->height;
}
else if ((p->zoom_factor > 0) && (p->shrink_factor > 0))
{
width_out = (width_in_eff_ - 1) / p->shrink_factor + 1;
height_out = (height_in_eff_ - 1) / p->shrink_factor + 1;
width_out = (width_out - 1) * (p->zoom_factor - 1) + width_out;
height_out = (height_out - 1) * (p->zoom_factor - 1) + height_out;
}
else if (NULL != inputs[1])
{
width_out = inputs[1]->attr.size[0];
height_out = inputs[1]->attr.size[1];
}
else
{
VSILOGE("Not support params ");
return FALSE;
}
if ((width_out < 0) || (height_out < 0) || (width_in_eff_ < 0) || (height_in_eff_ < 0))
{
VSILOGE("value shoud be positive: width_out %d height_out %d width_in_eff_ %d height_in_eff_ %d ",
width_out, height_out, width_in_eff_, height_in_eff_);
return FALSE;
}
outputs[0]->attr.size[0] = width_out;
outputs[0]->attr.size[1] = height_out;
}
factor = (float)(outputs[0]->attr.size[0]) / (float)(width_in_eff_);
if ((pad_beg > 0) || (pad_end > 0))
{
vsi_nn_tensor_attr_t attr;
int32_t use_virtual_tensor = 1;
int32_t *begin_dims;
int32_t *end_dims;
int32_t *stride_dims;
uint32_t i;
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, use_virtual_tensor);
crop_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
crop_in_tensor = crop_tensor->t;
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_STRIDED_SLICE, 1, 1 );
curr->node->nn_param.strided_slice.begin_dims_num = inputs[0]->attr.dim_num;
curr->node->nn_param.strided_slice.end_dims_num = inputs[0]->attr.dim_num;
curr->node->nn_param.strided_slice.stride_dims_num = inputs[0]->attr.dim_num;
curr->node->nn_param.strided_slice.begin_mask = 0;
curr->node->nn_param.strided_slice.end_mask = 0;
curr->node->nn_param.strided_slice.shrink_axis_mask = 0;
begin_dims = (int32_t *)vsi_nn_internal_new_node_param(curr,
VSI_NN_MAX_DIM_NUM * sizeof(uint32_t));
end_dims = (int32_t *)vsi_nn_internal_new_node_param(curr,
VSI_NN_MAX_DIM_NUM * sizeof(uint32_t));
stride_dims = (int32_t *)vsi_nn_internal_new_node_param(curr,
VSI_NN_MAX_DIM_NUM * sizeof(uint32_t));
for (i = 0; i < inputs[0]->attr.dim_num; i++)
{
stride_dims[i] = 1;
}
begin_dims[0] = pad_beg;
begin_dims[1] = pad_beg;
end_dims[0] = inputs[0]->attr.size[0] - pad_end;
end_dims[1] = inputs[0]->attr.size[1] - pad_end;
if (inputs[0]->attr.dim_num > 2)
{
for (i = 2 ; i < inputs[0]->attr.dim_num; i++)
{
begin_dims[i] = 0;
end_dims[i] = inputs[0]->attr.size[i];
}
}
curr->node->nn_param.strided_slice.begin_dims = begin_dims;
curr->node->nn_param.strided_slice.end_dims = end_dims;
curr->node->nn_param.strided_slice.stride_dims = stride_dims;
curr->inputs[0] = inputs[0];
curr->outputs[0] = crop_in_tensor;
vsi_nn_internal_setup_node(self, curr);
}
else
{
crop_in_tensor = inputs[0];
}
if ((width_in_eff_ == (int32_t)outputs[0]->attr.size[0]) && (height_in_eff_ == (int32_t)outputs[0]->attr.size[1]))
{
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 1, 1 );
curr->inputs[0] = crop_in_tensor;
curr->outputs[0] = outputs[0];
vsi_nn_internal_setup_node(self, curr);
}
else
{
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESIZE_INTERNAL, 1, 1 );
curr->node->nn_param.resize_internal.align_corners = vx_true_e;
curr->node->nn_param.resize_internal.factor = factor;
curr->node->nn_param.resize_internal.half_pixel_centers = vx_false_e;
curr->inputs[0] = crop_in_tensor;
curr->outputs[0] = outputs[0];
vsi_nn_internal_setup_node(self, curr);
}
return TRUE;
} /* op_setup() */
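/* The sizing branches above in closed form (integer division, as coded):
 *
 *     shrink only : out = (in_eff - 1) / shrink + 1
 *     zoom only   : out = (in_eff - 1) * (zoom - 1) + in_eff
 *                       = (in_eff - 1) * zoom + 1
 *     both        : shrink first, then zoom the shrunk size
 *
 * Worked example: in_eff = 10 with zoom_factor = 3 gives
 * (10 - 1) * 2 + 10 = 28 = (10 - 1) * 3 + 1. */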
static vsi_status op_optimize
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs,
vsi_nn_opt_direction_e direction
)
{
vsi_status status;
status = VSI_SUCCESS;
vsi_nn_internal_optimize_node( self, direction );
return status;
} /* op_optimize() */
static vsi_status op_init
(
vsi_nn_node_t* self
)
{
vsi_status status = VSI_SUCCESS;
status = vsi_nn_internal_init_node_wksp(self);
self->nn_param.interp.height = 0;
self->nn_param.interp.width = 0;
self->nn_param.interp.pad_beg = 0;
self->nn_param.interp.pad_end = 0;
self->nn_param.interp.shrink_factor = 0;
self->nn_param.interp.zoom_factor = 0;
return status;
} /* op_init() */
static vsi_status op_deinit
(
vsi_nn_node_t* self
)
{
vsi_status status = VSI_SUCCESS;
vsi_nn_internal_deinit_node_wksp(self);
status = vsi_nn_op_common_deinit(self);
return status;
} /* op_deinit() */
__BEGIN_DECLS
/* Registrar */
DEF_OP_REG
(
/* op_name */ INTERP,
/* init */ op_init,
/* compute */ op_compute,
/* deinit */ op_deinit,
/* check */ op_check,
/* setup */ op_setup,
/* optimize */ op_optimize,
/* input_num */ _INPUT_NUM,
/* output_num */ _OUTPUT_NUM
);
__END_DECLS


@@ -155,11 +155,13 @@ static vsi_bool op_check
     BEGIN_IO_TYPE_DECL(PERMUTE, 1, 1)
         IO_TYPE(D_F16, D_F16)
         IO_TYPE(D_F16, D_F32)
+        IO_TYPE(D_I16, D_I16)
         IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP)
         IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP)
         IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM)
         IO_TYPE(D_U8|Q_ASYM, D_F16)
         IO_TYPE(D_U8|Q_ASYM, D_F32)
+        IO_TYPE(D_I8|Q_SYM_PC, D_I8|Q_SYM_PC)
        IO_TYPE(D_BOOL8, D_BOOL8)
         IO_TYPE(D_BOOL8, D_I8|Q_DFP)
         IO_TYPE(D_F32, D_F32)


@@ -0,0 +1,207 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <string.h>
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_log.h"
#include "vsi_nn_node.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
/*
Declare number of input and output.
*/
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
static vsi_bool _is_same_shape
(
vsi_nn_tensor_t * inputs,
uint32_t *sizes,
uint32_t dims
)
{
uint32_t i = 0;
if (inputs->attr.dim_num != dims)
return FALSE;
for (i = 0; i < dims; i++)
{
if (sizes[i] != inputs->attr.size[i])
return FALSE;
}
return TRUE;
}
static vsi_status op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_status status = VSI_FAILURE;
status = vsi_nn_internal_compute_node( self );
return status;
} /* op_compute() */
static vsi_status op_optimize
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs,
vsi_nn_opt_direction_e direction
)
{
if ( _is_same_shape(inputs[0], outputs[0]->attr.size, outputs[0]->attr.dim_num) )
{
return vsi_nn_internal_optimize_node(self, direction );
}
else
{
return VSI_SUCCESS;
}
} /* op_optimize() */
static vsi_bool op_check
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
return TRUE;
} /* op_check() */
static vsi_bool op_setup
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
float factor = self->nn_param.resize_1d.factor;
vsi_nn_internal_node_t* curr = NULL;
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
if (factor != 0)
{
outputs[0]->attr.size[0] = (uint32_t)(inputs[0]->attr.size[0] * factor);
}
else
{
outputs[0]->attr.size[0] = self->nn_param.resize_1d.size[0];
}
outputs[0]->attr.size[1] = inputs[0]->attr.size[1];
outputs[0]->attr.size[2] = inputs[0]->attr.size[2];
outputs[0]->attr.size[3] = inputs[0]->attr.size[3];
}
if (_is_same_shape(inputs[0], outputs[0]->attr.size, outputs[0]->attr.dim_num))
{
vsi_nn_internal_init_node_wksp( self );
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 );
curr->inputs[0] = inputs[0];
curr->outputs[0] = outputs[0];
vsi_nn_internal_setup_node(self, curr);
}
else if (VSI_NN_INTERPOLATION_BILINEAR == self->nn_param.resize_1d.type)
{
vsi_nn_internal_init_node_wksp( self );
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESIZE_1D_BILINEAR_INTERNAL, 0, 0 );
curr->node->nn_param.resize_1d_bilinear_internal.align_corners = self->nn_param.resize_1d.align_corners;
curr->node->nn_param.resize_1d_bilinear_internal.factor = self->nn_param.resize_1d.factor;
curr->node->nn_param.resize_1d_bilinear_internal.half_pixel_centers = \
self->nn_param.resize_1d.half_pixel_centers;
curr->inputs[0] = inputs[0];
curr->outputs[0] = outputs[0];
vsi_nn_internal_setup_node(self, curr);
}
else if (VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR == self->nn_param.resize_1d.type)
{
vsi_nn_internal_init_node_wksp( self );
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESIZE_1D_NEAREST_INTERNAL, 0, 0 );
curr->node->nn_param.resize_1d_nearest_internal.align_corners = self->nn_param.resize_1d.align_corners;
curr->node->nn_param.resize_1d_nearest_internal.factor = self->nn_param.resize_1d.factor;
curr->node->nn_param.resize_1d_nearest_internal.half_pixel_centers = \
self->nn_param.resize_1d.half_pixel_centers;
curr->inputs[0] = inputs[0];
curr->outputs[0] = outputs[0];
vsi_nn_internal_setup_node(self, curr);
}
return TRUE;
} /* op_setup() */
static vsi_status op_init
(
vsi_nn_node_t* self
)
{
return VSI_SUCCESS;
} /* op_init() */
static vsi_status op_deinit
(
vsi_nn_node_t* self
)
{
vsi_status status = VSI_SUCCESS;
status = vsi_nn_internal_deinit_node_wksp(self);
return status;
} /* op_deinit() */
__BEGIN_DECLS
/* Registrar */
DEF_OP_REG
(
/* op_name */ RESIZE_1D,
/* init */ op_init,
/* compute */ op_compute,
/* deinit */ op_deinit,
/* check */ op_check,
/* setup */ op_setup,
/* optimize */ op_optimize,
/* input_num */ _INPUT_NUM,
/* output_num */ _OUTPUT_NUM
);
__END_DECLS
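/* Usage sketch (assumption: the standard ovxlib node-creation flow; the field
 * names come from op_setup above):
 *
 *     vsi_nn_node_t* node = vsi_nn_AddNode(graph, VSI_NN_OP_RESIZE_1D, 1, 1, NULL);
 *     node->nn_param.resize_1d.type   = VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR;
 *     node->nn_param.resize_1d.factor = 2.0f;  // or factor = 0 and size[0] = N
 *     node->nn_param.resize_1d.align_corners      = FALSE;
 *     node->nn_param.resize_1d.half_pixel_centers = FALSE;
 *
 * op_setup then routes to RESIZE_1D_NEAREST_INTERNAL (or the bilinear
 * sibling), or to plain DATACONVERT when the shape does not change. */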


@@ -0,0 +1,171 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <string.h>
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_log.h"
#include "vsi_nn_node.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_constraint_check.h"
/*
Declare number of input and output.
*/
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
static vsi_status op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_status status = VSI_FAILURE;
int32_t align_corners = self->nn_param.resize_1d_bilinear_internal.align_corners;
int32_t half_pixel_centers = self->nn_param.resize_1d_bilinear_internal.half_pixel_centers;
vsi_nn_kernel_param_t * param = NULL;
param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_int32( param, "align_corners", align_corners );
vsi_nn_kernel_param_add_int32( param, "half_pixel_centers", half_pixel_centers );
self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
"resize_1d_bilinear",
&inputs[0], 1,
&outputs[0], 1, param );
if ( self->n )
{
status = VSI_SUCCESS;
}
vsi_nn_kernel_param_release( &param );
return status;
} /* op_compute() */
static vsi_bool op_check
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
BEGIN_IO_TYPE_DECL(RESIZE_1D_BILINEAR_INTERNAL, 1, 1)
IO_TYPE(D_F16, D_U8|Q_ASYM)
IO_TYPE(D_F16, D_F16)
IO_TYPE(D_BF16, D_BF16)
IO_TYPE(D_F32, D_F32)
IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_F16)
IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP)
IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP)
END_IO_TYPE_DECL(RESIZE_1D_BILINEAR_INTERNAL)
if (!VALIDATE_OP_IO_TYPES(RESIZE_1D_BILINEAR_INTERNAL, self, inputs, self->input.num, outputs, self->output.num)) {
char* desc = generate_op_io_types_desc(inputs,
self->input.num, outputs, self->output.num);
VSILOGE("Inputs/Outputs data type not support: %s", desc);
destroy_op_io_types_desc(desc);
return FALSE;
}
return TRUE;
} /* op_check() */
static vsi_bool op_setup
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
float factor = self->nn_param.resize_1d_bilinear_internal.factor;
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
if (factor != 0)
{
outputs[0]->attr.size[0] = (uint32_t)(inputs[0]->attr.size[0] * factor);
}
else
{
outputs[0]->attr.size[0] = self->nn_param.resize_1d.size[0];
}
outputs[0]->attr.size[1] = inputs[0]->attr.size[1];
outputs[0]->attr.size[2] = inputs[0]->attr.size[2];
outputs[0]->attr.size[3] = inputs[0]->attr.size[3];
}
return TRUE;
} /* op_setup() */
static vsi_status op_init
(
vsi_nn_node_t* self
)
{
return VSI_SUCCESS;
} /* op_init() */
static vsi_status op_deinit
(
vsi_nn_node_t* self
)
{
vsi_status status = VSI_SUCCESS;
status = vsi_nn_op_common_deinit(self);
return status;
} /* op_deinit() */
__BEGIN_DECLS
/* Registrar */
DEF_OP_REG
(
/* op_name */ RESIZE_1D_BILINEAR_INTERNAL,
/* init */ op_init,
/* compute */ op_compute,
/* deinit */ op_deinit,
/* check */ op_check,
/* setup */ op_setup,
/* optimize */ NULL,
/* input_num */ _INPUT_NUM,
/* output_num */ _OUTPUT_NUM
);
__END_DECLS


@@ -0,0 +1,170 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <string.h>
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_log.h"
#include "vsi_nn_node.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_constraint_check.h"
/*
Declare number of input and output.
*/
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
static vsi_status op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_status status = VSI_FAILURE;
int32_t align_corners = self->nn_param.resize_1d_nearest_internal.align_corners;
int32_t half_pixel_centers = self->nn_param.resize_1d_nearest_internal.half_pixel_centers;
vsi_nn_kernel_param_t * param = NULL;
param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_int32( param, "align_corners", align_corners );
vsi_nn_kernel_param_add_int32( param, "half_pixel_centers", half_pixel_centers );
self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
"resize_1d_nearest",
&inputs[0], 1,
&outputs[0], 1, param );
if ( self->n )
{
status = VSI_SUCCESS;
}
vsi_nn_kernel_param_release( &param );
return status;
} /* op_compute() */
static vsi_bool op_check
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
BEGIN_IO_TYPE_DECL(RESIZE_1D_NEAREST_INTERNAL, 1, 1)
IO_TYPE(D_F16, D_F16)
IO_TYPE(D_BF16, D_BF16)
IO_TYPE(D_F32, D_F32)
IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM)
IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP)
IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP)
END_IO_TYPE_DECL(RESIZE_1D_NEAREST_INTERNAL)
if (!VALIDATE_OP_IO_TYPES(RESIZE_1D_NEAREST_INTERNAL, self, inputs, self->input.num, outputs, self->output.num)) {
char* desc = generate_op_io_types_desc(inputs,
self->input.num, outputs, self->output.num);
VSILOGE("Inputs/Outputs data type not support: %s", desc);
destroy_op_io_types_desc(desc);
return FALSE;
}
return TRUE;
} /* op_check() */
static vsi_bool op_setup
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
float factor = self->nn_param.resize_1d_nearest_internal.factor;
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
if (factor != 0)
{
outputs[0]->attr.size[0] = (uint32_t)(inputs[0]->attr.size[0] * factor);
}
else
{
outputs[0]->attr.size[0] = self->nn_param.resize_1d.size[0];
}
outputs[0]->attr.size[1] = inputs[0]->attr.size[1];
outputs[0]->attr.size[2] = inputs[0]->attr.size[2];
outputs[0]->attr.size[3] = inputs[0]->attr.size[3];
}
return TRUE;
} /* op_setup() */
static vsi_status op_init
(
vsi_nn_node_t* self
)
{
return VSI_SUCCESS;
} /* op_init() */
static vsi_status op_deinit
(
vsi_nn_node_t* self
)
{
vsi_status status = VSI_SUCCESS;
status = vsi_nn_op_common_deinit(self);
return status;
} /* op_deinit() */
__BEGIN_DECLS
/* Registrar */
DEF_OP_REG
(
/* op_name */ RESIZE_1D_NEAREST_INTERNAL,
/* init */ op_init,
/* compute */ op_compute,
/* deinit */ op_deinit,
/* check */ op_check,
/* setup */ op_setup,
/* optimize */ NULL,
/* input_num */ _INPUT_NUM,
/* output_num */ _OUTPUT_NUM
);
__END_DECLS


@@ -163,12 +163,17 @@ static vsi_bool op_check
         ret = FALSE;
     }

+    if(ret)
     {
         BEGIN_IO_TYPE_DECL(SVDF, 5, 2)
             IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F16, D_F16, D_F16)
             IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_F16, D_F16)
             IO_TYPE(D_F32, D_F16, D_F16, D_F16, D_F32, D_F32, D_F16)
             IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F32, D_F32)
+            IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F16, D_F16, D_NONE)
+            IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_F16, D_NONE)
+            IO_TYPE(D_F32, D_F16, D_F16, D_F16, D_F32, D_F32, D_NONE)
+            IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F32, D_NONE)
         END_IO_TYPE_DECL(SVDF)
         if(!VALIDATE_OP_IO_TYPES(SVDF, self, inputs, self->input.num, outputs, self->output.num)) {
             char* desc = generate_op_io_types_desc(inputs,


@@ -117,8 +117,7 @@ static void _try_pack_tensor_data
         {
             *p_sz = (uint64_t)bytes;
         }
-        free( data );
-        data = NULL;
+        vsi_nn_safe_free( data );
     }
 }
 } /* _pack_tensor_data() */
@@ -417,6 +416,8 @@ static _op_param_gen_t s_op_gen[] =
     /* PRE_PROCESS_NV12 */ NULL,
     /* SCATTER_ND */ NULL,
     /* DECONVOLUTION1D */ NULL,
+    /* INTERP */ NULL,
+    /* RESIZE_1D */ NULL,
     };

 _compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c );


@@ -149,7 +149,7 @@ vsi_nn_tensor_t* vsi_nn_Concat
             src = j;
             memcpy( &buffer[dst * type_bytes], &tmp[src * type_bytes], type_bytes );
         }
-        free(tmp);
+        vsi_nn_safe_free( tmp );
         offset += strides[axis] * tensors[i]->attr.size[axis];
     }
     tensor_out = vsi_nn_CreateTensorFromData( graph, buffer, &output_attr );
@@ -221,11 +221,7 @@ vsi_nn_tensor_t* vsi_nn_ConvertTensorDtype
         }
     }

-    if( src_buf )
-    {
-        free( src_buf );
-        src_buf = NULL;
-    }
+    vsi_nn_safe_free( src_buf );
     if( dst_buf )
     {
         free( dst_buf );
@@ -333,10 +329,7 @@ vsi_nn_tensor_t* vsi_nn_TensorAdd
 error:
     for ( i = 0; i < tensor_num; i++ )
     {
-        if ( buffer[i] )
-        {
-            free(buffer[i]);
-        }
+        vsi_nn_safe_free( buffer[i] );
     }
     if( tmp )
     {
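These hunks replace hand-rolled free-and-NULL sequences with vsi_nn_safe_free. Its definition is not part of this diff; a typical shape, stated as an assumption:

    /* Assumed definition (utils/vsi_nn_util.h): free, then NULL the pointer,
     * so a later double free or stale read degrades to a no-op. */
    #define vsi_nn_safe_free(_ptr) do { \
            if (_ptr) { free(_ptr); (_ptr) = NULL; } \
        } while (0)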


@@ -710,33 +710,6 @@ vsi_bool vsi_nn_CheckFilePath
     return FALSE;
 } /* vsi_nn_CheckFilePath() */

-void vsi_nn_GetFP32MultiAndPostShift
-    (
-    vx_float32 mult,
-    vx_uint16 *M0,
-    vx_int8 *N
-    )
-{
-    vx_uint32 uintMult         = *((vx_uint32*)(&mult));
-    vx_uint32 tmpMultiply      = 0;
-    vx_int32  exp              = 0;
-    vx_uint32 postShiftBit6to5 = 0;
-    vx_uint32 postShift        = 0;
-    vx_int8   tmpPostShift     = 0;
-
-    tmpMultiply      = (uintMult & 0x7FFFFF) >> 8;
-    *M0              = (vx_uint16)((1U << 15) + tmpMultiply);
-
-    exp              = (uintMult & 0x7F800000) >> 23; /* postShift is Scale's exp*/
-    tmpPostShift     = 15 - ((vx_int8)exp - 127);
-    postShift        = tmpPostShift & 0x1F;
-    tmpPostShift     = tmpPostShift >> 5;
-    postShiftBit6to5 = tmpPostShift & 3;
-
-    *N = (vx_int8)(((postShiftBit6to5 << 5) | (postShift & 0x1F)));
-    *N = (((vx_int32)*N << 25) >> 25);
-}/* vsi_nn_GetFP32MultiAndPostShift() */
-
 typedef struct
 {
     uint8_t* raw_addr;
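For reference, the removed helper encoded a float scale as a 16-bit multiplier plus a signed 7-bit post-shift. Reconstructed from the bit manipulation above, with $f$ the 23-bit mantissa and $e$ the biased exponent of the input $m$:

    m = (1 + f \cdot 2^{-23}) \cdot 2^{e-127}, \qquad
    M_0 = 2^{15} + \lfloor f / 2^{8} \rfloor, \qquad
    N = 15 - (e - 127)

so that $m \approx M_0 \cdot 2^{-N}$ with $M_0 \in [2^{15}, 2^{16})$; the final two statements pack and sign-extend $N$ from 7 bits.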


@@ -520,6 +520,7 @@ static vx_tensor _create_const_raw_tensor
     vx_tensor tensor = NULL;
     vx_tensor_create_params_t params;
     float * scales = NULL;
+    int32_t * zeroPoints = NULL;

     memset( &params, 0, sizeof( vx_tensor_create_params_t ) );
     params.num_of_dims = attr.dim_num;
@@ -539,12 +540,14 @@ static vx_tensor _create_const_raw_tensor
 #ifdef VSI_PERCHANNEL_QUANTIZATION_SUPPORT
             // This is a hack that driver doesn't support const scale
             scales = (float *)malloc(sizeof(float) * attr.dtype.scale_dim);
+            zeroPoints = (int32_t *)malloc(sizeof(int32_t) * attr.dtype.zero_points_dim);
             memcpy(scales, attr.dtype.scales, attr.dtype.scale_dim * sizeof(float));
+            memcpy(zeroPoints, attr.dtype.zero_points, attr.dtype.zero_points_dim * sizeof(float));
             params.quant_data.affinePerChannel.channelDim = attr.dtype.channel_dim;
             params.quant_data.affinePerChannel.scaleCount = attr.dtype.scale_dim;
             params.quant_data.affinePerChannel.scales = scales;
-            params.quant_data.affinePerChannel.zeroPoint = NULL;
-            params.quant_data.affinePerChannel.zeroPointCount = 0;
+            params.quant_data.affinePerChannel.zeroPoint = zeroPoints;
+            params.quant_data.affinePerChannel.zeroPointCount = attr.dtype.zero_points_dim;
             break;
 #else
             VSILOGE( "can't support qnt_type VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC." );
@@ -580,6 +583,10 @@ static vx_tensor _create_const_raw_tensor
         {
             free( scales );
         }
+        if (zeroPoints)
+        {
+            free( zeroPoints );
+        }
         return NULL;
     }
 }
@@ -620,6 +627,10 @@ static vx_tensor _create_const_raw_tensor
     {
         free( scales );
     }
+    if (zeroPoints)
+    {
+        free( zeroPoints );
+    }
     return tensor;
 } /* _create_const_raw_tensor() */
@@ -689,6 +700,8 @@ static void _convert_const_I8toU8
     if ( tensor->t ) vxReleaseTensor(&tensor->t);
     tensor->t = vsi_nn_CreateRawTensorFromData(graph, data, attr);
+
+    vsi_nn_safe_free( data );
 }/* _convert_const_I8toU8() */

 static vsi_status _convert_graph_const_tensor


@@ -29,25 +29,49 @@
 #include "vsi_nn_log.h"
 #include "vsi_nn_types.h"

+#ifdef __ANDROID__
+#if ANDROID_SDK_VERSION >= 30
+static const char* ENV_LOG_LEVEL = "vendor.VSI_NN_LOG_LEVEL";
+#else
+static const char* ENV_LOG_LEVEL = "VSI_NN_LOG_LEVEL";
+#endif
+#else
+static const char* ENV_LOG_LEVEL = "VSI_NN_LOG_LEVEL";
+#endif
+
+int get_env_as_int(const char* env, int default_value) {
+    int value = default_value;
+#ifdef __ANDROID__
+    {
+        char value_str[100];
+        int status = __system_property_get(env, value_str);
+        if (status) {
+            value = atoi(value_str);
+        }
+    }
+#else
+    {
+        char* env_s = getenv(env);
+        if (env_s) {
+            value = atoi(env_s);
+        }
+    }
+#endif
+    return value;
+}
+
 static vsi_bool _check_log_level
     (
     vsi_nn_log_level_e level
     )
 {
-    char *env_level_s;
     static vsi_nn_log_level_e env_level = VSI_NN_LOG_UNINIT;
+
     if(env_level == VSI_NN_LOG_UNINIT)
     {
-        env_level_s = getenv("VSI_NN_LOG_LEVEL");
-        if(env_level_s)
-        {
-            env_level = (vsi_nn_log_level_e)atoi(env_level_s);
-        }
-        else
-        {
-            env_level = VSI_NN_LOG_WARN;
-        }
+        env_level = (vsi_nn_log_level_e)get_env_as_int(ENV_LOG_LEVEL, VSI_NN_LOG_WARN);
     }

 if(env_level >= level)
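A consequence of the rename: on Android builds with ANDROID_SDK_VERSION >= 30 the level is read from the vendor.VSI_NN_LOG_LEVEL system property (e.g. set via setprop) instead of the process environment. Reading any integer knob through the new helper is one call; the default below is illustrative:

    /* _check_log_level() passes VSI_NN_LOG_WARN as the real fallback. */
    int level = get_env_as_int(ENV_LOG_LEVEL, 3 /* illustrative default */);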


@@ -844,7 +844,7 @@ float * vsi_nn_ConvertTensorToFloat32Data
     if( !tensor->attr.is_created_from_handle )
     {
-        if(tensor_data)free(tensor_data);
+        vsi_nn_safe_free( tensor_data );
     }
     return data;
 } /* vsi_nn_ConvertTensorToFloat32Data() */
@@ -1095,7 +1095,7 @@ void vsi_nn_SaveTensorToTextByFp32
     }
     fwrite( buf, count, 1, fp );
     fclose( fp );
-    free( data );
+    vsi_nn_safe_free( data );
 } /* vsi_nn_SaveTensorToTextByFp32() */
@@ -1124,7 +1124,7 @@ void vsi_nn_SaveTensorToText
     sz = vsi_nn_GetElementNum( tensor );
     vsi_nn_SaveDataToText( filename, data, sz,
         tensor->attr.dtype.vx_type, seperator );
-    free( data );
+    vsi_nn_safe_free( data );
 } /* vsi_nn_SaveTensorToText() */
@@ -1219,7 +1219,7 @@ void vsi_nn_SaveTensorToBinary
     }
     fwrite( data, sz, 1, fp );
     fclose( fp );
-    free( data );
+    vsi_nn_safe_free( data );
 } /* vsi_nn_SaveTensorToBinary() */
@@ -1539,7 +1539,7 @@ void vsi_nn_TransposeTensor
         VSILOGE( "Copy transpose data fail with code %#x.", status );
     }

-    free( buf );
+    vsi_nn_safe_free( buf );
     free( dst );
 } /* vsi_nn_TransposeTensor() */
@@ -1588,7 +1588,7 @@ void vsi_nn_PermuteTensor
         if( perm[i] >= dim_num )
         {
             VSILOGW( "Incorrect perm %d", perm[i] );
-            if( buf ) { free(buf); buf = NULL; }
+            vsi_nn_safe_free( buf );
             if( dst ) { free(dst); dst = NULL; }
             return;
         }
@@ -1603,7 +1603,7 @@ void vsi_nn_PermuteTensor
         VSILOGE( "Copy permute data fail with code %#x.", status );
     }
-    if( buf ) { free(buf); buf = NULL; }
+    vsi_nn_safe_free( buf );
     if( dst ) { free(dst); dst = NULL; }
 } /* vsi_nn_PermuteTensor() */
@@ -2241,7 +2241,7 @@ void vsi_nn_reshuffle_weight_data
     }
     vsi_nn_CopyDataToTensor( graph, weights, weight_data );
     vsi_nn_Free( buffer );
-    vsi_nn_Free( weight_data );
+    vsi_nn_safe_free( weight_data );
 }

 vsi_nn_tensor_t* vsi_nn_ConcatTensor_impl