Update internal ovxlib to rel/1.2.2 (#674)

Update to SHA:806fcd6a69d333e62508acf0a6aa2c38c8385eae

Type: Code Improvement

Signed-off-by: Feiyue Chen <Feiyue.Chen@verisilicon.com>
Chen Feiyue 2024-01-03 13:13:15 +08:00 committed by GitHub
parent cf099e3849
commit 2d9e614a06
203 changed files with 18939 additions and 5096 deletions

View File

@ -3,6 +3,9 @@
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
# Some header file
include/vsi_nn_feature_config.h
# User-specific files
*.suo
*.user

View File

@ -195,3 +195,5 @@ DEF_OP(GRID_SAMPLE)
DEF_OP(LPNORM)
DEF_OP(RESIZE_3D)
DEF_OP(REDUCEL2)
DEF_OP(CROP_AND_RESIZE)
DEF_OP(TAN)

View File

@ -55,6 +55,7 @@ typedef int32_t vsi_nn_kernel_lut_act_e; enum
VSI_NN_KERNEL_LUT_ATANH = 21,
VSI_NN_KERNEL_LUT_ACOSH = 22,
VSI_NN_KERNEL_LUT_INVERSE_SIGMOID = 23,
VSI_NN_KERNEL_LUT_TAN = 24,
};

View File

@ -106,10 +106,21 @@ enum
BI_LSTM_BW_INPUT_LAYERNORM_C = 54,
BI_LSTM_BW_INPUT_LAYERNORM_O = 55,
BI_LSTM_FW_INPUT_BIAS_R2I = 56,
BI_LSTM_FW_INPUT_BIAS_R2F = 57,
BI_LSTM_FW_INPUT_BIAS_R2C = 58,
BI_LSTM_FW_INPUT_BIAS_R2O = 59,
BI_LSTM_BW_INPUT_BIAS_R2I = 60,
BI_LSTM_BW_INPUT_BIAS_R2F = 61,
BI_LSTM_BW_INPUT_BIAS_R2C = 62,
BI_LSTM_BW_INPUT_BIAS_R2O = 63,
BI_LSTM_INPUT_CNT,
BI_LSTM_FW_OUTPUT_OUTPUT = 0,
BI_LSTM_BW_OUTPUT_OUTPUT = 1,
BI_LSTM_OUTPUT_CNT
};

View File

@ -0,0 +1,47 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_CROP_AND_RESIZE_H
#define _VSI_NN_OP_CROP_AND_RESIZE_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_crop_and_resize_param
{
struct _crop_and_resize_local_data_t * lcl_data;
const int32_t* crop_size;
vsi_enum resize_method;
float extrapolation_value;
} vsi_nn_crop_and_resize_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -70,6 +70,11 @@ enum
LSTM_INPUT_AUX_WEIGHT_I2C = 27,
LSTM_INPUT_AUX_WEIGHT_I2O = 28,
LSTM_INPUT_BIAS_R2I = 29,
LSTM_INPUT_BIAS_R2F = 30,
LSTM_INPUT_BIAS_R2C = 31,
LSTM_INPUT_BIAS_R2O = 32,
LSTM_INPUT_CNT,
LSTM_OUTPUT_OUTPUT = 0,

View File

@ -74,6 +74,11 @@ enum
LSTMUNIT_INPUT_AUX_WEIGHT_I2C = 27,
LSTMUNIT_INPUT_AUX_WEIGHT_I2O = 28,
LSTMUNIT_INPUT_BIAS_R2I = 29,
LSTMUNIT_INPUT_BIAS_R2F = 30,
LSTMUNIT_INPUT_BIAS_R2C = 31,
LSTMUNIT_INPUT_BIAS_R2O = 32,
LSTMUNIT_INPUT_CNT,
LSTMUNIT_OUTPUT_OUTPUT = 0,

View File

@ -38,7 +38,8 @@ typedef uint32_t vsi_nn_interpolation_type_t; enum
{
VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR = 0,
VSI_NN_INTERPOLATION_BILINEAR,
VSI_NN_INTERPOLATION_AREA
VSI_NN_INTERPOLATION_AREA,
VSI_NN_INTERPOLATION_CUBIC
};
typedef uint32_t vsi_nn_resize_layout_type_t; enum

View File

@ -33,6 +33,7 @@ extern "C" {
typedef struct _vsi_nn_scatter_nd_update_param
{
vsi_bool use_locking;
vsi_nn_reduction_type_e reduction;
} vsi_nn_scatter_nd_update_param;
#ifdef __cplusplus

View File

@ -471,6 +471,12 @@ char* vsi_nn_getenv
const char * var_name
);
int32_t vsi_nn_getenv_asint
(
const char* env,
int32_t default_value
);
FILE* vsi_nn_fopen
(
const char * file_name,

View File

@ -43,6 +43,7 @@ class IDevice {
OVXLIB_API IDevice(uint32_t id);
OVXLIB_API ~IDevice();
OVXLIB_API uint32_t Id() const;
OVXLIB_API bool GraphSubmit(vsi_nn_graph_t* graph, bool (*func)(const void*), data_t data);
OVXLIB_API bool GraphSubmit(vsi_nn_graph_t* graph, func_t func, data_t data);
OVXLIB_API bool GraphRemove(const vsi_nn_graph_t* graph);
OVXLIB_API bool ThreadExit();

View File

@ -79,6 +79,8 @@ typedef struct _vsi_nn_runtime_option_t
int32_t enable_dataconvert_optimize;
int32_t enable_stream_processor;
int32_t enable_rgb88_planar_nhwc;
int32_t enable_slice_optimize;
int32_t enable_batch_opt;
} vsi_nn_runtime_option_t;
/**

View File

@ -1,3 +1,26 @@
/****************************************************************************
*
* Copyright (c) 2019 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the Software),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
/*****Auto generated header file, Please DO NOT modify manually!*****/
#ifndef _VSI_NN_FEATURE_CONFIG_H
#define _VSI_NN_FEATURE_CONFIG_H
@ -20,5 +43,15 @@
#define VSI_CONCAT_ENHANCE_SUPPORT
#endif
#define VSI_CREATE_TENSOR_FROM_VIEW_SUPPORT
#ifndef VSI_SWAP_HANDLE_CACHE_SUPPORT
#define VSI_SWAP_HANDLE_CACHE_SUPPORT
#endif
#define VSI_EXPORT_APIS_FOR_SETUP_GRAPH 1
#if defined(VX_SET_TENSOR_MEMPOOL_TYPE_SUPPORT) && VX_SET_TENSOR_MEMPOOL_TYPE_SUPPORT
#define VSI_CREATE_TENSOR_FROM_AXISRAM_SUPPORT
#endif
#if defined(VX_13_NN_COMPATIBLITY)
#define VSI_MAP_TENSOR_PATCH_SUPPORT
#endif
#endif

View File

@ -382,6 +382,31 @@ OVXLIB_API vsi_nn_tensor_id_t vsi_nn_AddTensorFromView
vsi_size_t* end
);
/**
* Add a new tensor from AXI-SRAM
* Create a new tensor from internal AXI-SRAM and add it to the graph.
* This only creates the tensor object; the memory in AXI-SRAM is not actually
* allocated until the graph verification stage. In other words, the tensor object is
* created beforehand, but the memory that stores its data is not allocated until
* vsi_nn_VerifyGraph runs. AXI-SRAM is an internal memory resource whose allocation
* is scheduled during graph verification to optimize performance and resource usage.
* If there is not enough memory in AXI-SRAM, vsi_nn_VerifyGraph will return VSI_FAILURE.
* The user cannot access the tensor memory (read/write tensor data) before the graph has
* been verified, since the tensor memory is not allocated yet.
* @param[in] graph Graph handle
* @param[in] id Optional id to the tensor, set it to VSI_NN_TENSOR_ID_AUTO,
* and a new id will be generated.
* @param[in] attr Tensor attributes for the new tensor.
*
* @return The new tensor id on success, or VSI_NN_TENSOR_ID_NA otherwise.
*/
OVXLIB_API vsi_nn_tensor_id_t vsi_nn_AddTensorFromAXISRAM
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_id_t id,
vsi_nn_tensor_attr_t * attr
);
/**
* Attach tensor to graph
* Attach an exist tensor to graph.
@ -796,6 +821,18 @@ OVXLIB_API vsi_status vsi_nn_SetGraphTransformOption
size_t size
);
/**
* Graph shape inference
*
* @param[in] graph Graph handle
*
* @return VSI_SUCCESS on success, or appropriate error code otherwise
* */
OVXLIB_API vsi_status vsi_nn_InferShape
(
vsi_nn_graph_t* graph
);
#ifdef __cplusplus
}
#endif
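
A minimal usage sketch for the new AXI-SRAM tensor API follows. It assumes a graph that has already been populated with nodes; the helper name, attribute values, and umbrella header are illustrative rather than part of this change, and only vsi_nn_AddTensorFromAXISRAM, vsi_nn_VerifyGraph, and the tensor-id constants come from the declarations above.

#include <string.h>
#include "vsi_nn_pub.h" /* assumed ovxlib umbrella header */

/* Illustrative helper: attribute values are placeholders. */
static vsi_status _add_axisram_tensor_sketch( vsi_nn_graph_t * graph )
{
    vsi_nn_tensor_attr_t attr;
    vsi_nn_tensor_id_t id;

    memset( &attr, 0, sizeof( attr ) );
    attr.dim_num = 2;
    attr.size[0] = 64;
    attr.size[1] = 64;
    attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16;

    /* Only the tensor object is created here; no AXI-SRAM is allocated yet. */
    id = vsi_nn_AddTensorFromAXISRAM( graph, VSI_NN_TENSOR_ID_AUTO, &attr );
    if ( VSI_NN_TENSOR_ID_NA == id )
    {
        return VSI_FAILURE;
    }

    /* AXI-SRAM is allocated during verification, which fails if the on-chip
       memory is too small; tensor data must not be read or written before
       this call succeeds. */
    return vsi_nn_VerifyGraph( graph );
}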

View File

@ -155,6 +155,22 @@ OVXLIB_API void vsi_nn_PrintNode
vsi_nn_node_id_t id
);
#if VX_GRAPH_BATCH_OPT_SUPPORT
/**
* Set how many pieces this node is split into along the batch dimension.
*
* @param[in] node Node.
* @param[in] split_num Number of splits along the batch dimension.
*
* @return VSI_SUCCESS on success, or error code otherwise.
*/
OVXLIB_API vsi_status vsi_nn_SetNodeBatchSplitNum
(
vsi_nn_node_t* node,
int8_t split_num
);
#endif
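
A hedged sketch of how the new batch-split hook might be called; the node pointer and split count are illustrative, and the call is only available when VX_GRAPH_BATCH_OPT_SUPPORT is defined.

#if VX_GRAPH_BATCH_OPT_SUPPORT
    /* Illustrative: ask the runtime to split this node into 4 pieces along
       the batch dimension before the graph is verified. */
    if ( VSI_SUCCESS != vsi_nn_SetNodeBatchSplitNum( node, 4 ) )
    {
        VSILOGE( "Set batch split num failed" );
    }
#endif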
/**
* Update node attribute
* Update openvx node attribute based on ovxlib's node attribute

View File

@ -209,6 +209,7 @@
#include "ops/vsi_nn_op_lpnorm.h"
#include "ops/vsi_nn_op_resize_3d.h"
#include "ops/vsi_nn_op_reducel2.h"
#include "ops/vsi_nn_op_crop_and_resize.h"
/* custom node head define define */
#include "custom/vsi_nn_custom_node_type.h"
#include "ops/vsi_nn_op_inverse_sigmoid.h"
@ -406,6 +407,7 @@ typedef union _vsi_nn_nn_param
vsi_nn_lpnorm_param lpnorm;
vsi_nn_resize_3d_param resize_3d;
vsi_nn_reducel2_param reducel2;
vsi_nn_crop_and_resize_param crop_and_resize;
void* client_param;
/* custom node data struct define */

View File

@ -35,6 +35,9 @@
#if defined(VX_KHR_COMPATIBILITY) && (0x1==VX_KHR_COMPATIBILITY)
#include <VX/vx_khr_compatible.h>
#endif
#ifdef VSI_CREATE_TENSOR_FROM_AXISRAM_SUPPORT
#include <VX/vx_viv_sys.h>
#endif
/*
This is a compatibility head file for backward compatibility OpenVX 1.1 spec

View File

@ -89,6 +89,8 @@ typedef enum
VSI_NN_SOURCE_FORMAT_IMAGE_YUYV422,
VSI_NN_SOURCE_FORMAT_IMAGE_UYVY422,
VSI_NN_SOURCE_FORMAT_IMAGE_NV21,
VSI_NN_SOURCE_FORMAT_IMAGE_NV12_RGGB,
VSI_NN_SOURCE_FORMAT_IMAGE_NV21_BGGR,
} vsi_nn_preprocess_source_format_e;
/**

View File

@ -54,5 +54,10 @@
#include "utils/vsi_nn_dtype_util.h"
#include "quantization/vsi_nn_asymmetric_affine.h"
#include "quantization/vsi_nn_dynamic_fixed_point.h"
#if defined(VSI_ENABLE_LCOV_TEST) && VSI_ENABLE_LCOV_TEST
#include "lcov/vsi_nn_coverage.h"
#endif
#endif

View File

@ -817,6 +817,82 @@ vsi_nn_tensor_t * vsi_nn_dropout_tensor
float rate
);
/**
* Allows the application to get direct access to a patch of tensor object.
* A wrapper API for the OpenVX vxMapTensorPatch call.
*
* @param[in] graph Graph handle.
* @param[in] tensor Tensor handle.
* @param[out] ptr The address of a pointer that the function sets to the
* address where the requested data can be accessed. The returned (*ptr) address
* is only valid between the call to the function and the corresponding call to
* vsi_nn_UnmapTensorPatch.
* @param [in] usage This declares the access mode for the tensor patch, using
* the vsi_nn_accessor_type_e enumeration.
* VSI_NN_READ_ONLY: after the function call, the content of the memory location
* pointed by (*ptr) contains the tensor patch data. Writing into this memory location
* is forbidden and its behavior is undefined.
* VSI_NN_READ_AND_WRITE : after the function call, the content of the memory
* location pointed by (*ptr) contains the tensor patch data; writing into this memory
* is allowed only for the location of items and will result in a modification of the
* affected items in the tensor object once the range is unmapped. Writing into
* a gap between items (when (*stride) > item size in bytes) is forbidden and its
* behavior is undefined.
* VSI_NN_WRITE_ONLY: after the function call, the memory location pointed by (*ptr)
* contains undefined data; writing each item of the range is required prior to
* unmapping. Items not written by the application before unmap will become
* undefined after unmap, even if they were well defined before map. Like for
* VSI_NN_READ_AND_WRITE, writing into a gap between items is forbidden and its behavior
* is undefined.
* @return VSI_SUCCESS on success, or error code otherwise.
*/
OVXLIB_API vsi_status vsi_nn_MapTensorPatch
(
vsi_nn_graph_t* graph,
vsi_nn_tensor_t* tensor,
void** ptr,
vsi_nn_accessor_type_e usage
);
/**
* Unmap and commit potential changes to a tensor object patch that was previously mapped.
* Unmapping a tensor patch invalidates the memory location from which the patch could
* be accessed by the application. Accessing this memory location after the unmap function
* completes results in undefined behavior.
* @param[in] graph Graph handle.
* @param [in] tensor The reference to the tensor object to unmap.
* @return VSI_SUCCESS on success, or error code otherwise.
*/
OVXLIB_API vsi_status vsi_nn_UnmapTensorPatch
(
vsi_nn_graph_t* graph,
vsi_nn_tensor_t* tensor
);
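
A sketch of the intended map/write/unmap sequence, assuming the graph has already been verified and tensor refers to one of its tensors; the float element type and the single-element write are illustrative only.

    void * ptr = NULL;
    vsi_status status = vsi_nn_MapTensorPatch( graph, tensor, &ptr, VSI_NN_READ_AND_WRITE );
    if ( VSI_SUCCESS == status && NULL != ptr )
    {
        float * data = (float *)ptr; /* (*ptr) is only valid until the unmap call */
        data[0] = 1.0f;              /* illustrative write into the mapped patch */
        /* Commit the change and invalidate the mapping. */
        status = vsi_nn_UnmapTensorPatch( graph, tensor );
    }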
/**
* Create a new tensor from internal AXI-SRAM (kernel-driver mapped).
* This only creates the tensor object; the memory in AXI-SRAM is not actually
* allocated until the graph verification stage. In other words, the tensor
* object is created beforehand, but the memory that stores its data is not
* allocated until vsi_nn_VerifyGraph runs. AXI-SRAM is an internal memory
* resource whose allocation is scheduled during graph verification to optimize
* performance and resource usage.
* If there is not enough memory in AXI-SRAM, vsi_nn_VerifyGraph will return VSI_FAILURE.
* The user cannot access the tensor memory (read/write tensor data) before the graph has
* been verified, since the tensor memory is not allocated yet.
* @param[in] graph Graph handle
* @param[in] attr Tensor attributes for the new tensor.
*
* @return Tensor handle on success, or NULL otherwise.
*/
OVXLIB_API vsi_nn_tensor_t * vsi_nn_CreateTensorFromAXISRAM
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_attr_t * attr
);
#ifdef __cplusplus
}
#endif

View File

@ -115,7 +115,9 @@ typedef enum
{
VSI_NN_REDUCTION_TYPE_NONE,
VSI_NN_REDUCTION_TYPE_ADD,
VSI_NN_REDUCTION_TYPE_MUL
VSI_NN_REDUCTION_TYPE_MUL,
VSI_NN_REDUCTION_TYPE_MAX,
VSI_NN_REDUCTION_TYPE_MIN
} vsi_nn_reduction_type_e;
/** Pad mode enum */
@ -269,7 +271,9 @@ typedef enum _vsi_nn_yuv_type
typedef enum _vsi_nn_nv_type
{
VSI_NN_YUV_TYPE_NV12,
VSI_NN_YUV_TYPE_NV21
VSI_NN_YUV_TYPE_NV21,
VSI_NN_YUV_TYPE_NV12_RGGB,
VSI_NN_YUV_TYPE_NV21_BGGR
}vsi_nn_nv_type;
typedef enum _vsi_nn_roi_align_type_e
@ -283,6 +287,12 @@ typedef enum _vsi_nn_custom_warp_affine_type_e {
VSI_NN_WARP_AFFINE_TYPE_RGB
} vsi_nn_custom_warp_affine_type_e;
typedef enum _vsi_nn_accessor_type_e {
VSI_NN_READ_ONLY = VX_READ_ONLY,
VSI_NN_WRITE_ONLY = VX_WRITE_ONLY,
VSI_NN_READ_AND_WRITE = VX_READ_AND_WRITE
} vsi_nn_accessor_type_e;
/** Deprecated */
typedef uint32_t vsi_nn_size_t;

View File

@ -32,8 +32,8 @@ extern "C"{
#endif
#define VSI_NN_VERSION_MAJOR 1
#define VSI_NN_VERSION_MINOR 1
#define VSI_NN_VERSION_PATCH 88
#define VSI_NN_VERSION_MINOR 2
#define VSI_NN_VERSION_PATCH 2
#define VSI_NN_VERSION \
(VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
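With this bump the packed value becomes 1 * 10000 + 2 * 100 + 2 = 10202 (previously 1 * 10000 + 1 * 100 + 88 = 10188), so checks against VSI_NN_VERSION pick up the rel/1.2.2 update.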

View File

@ -14,6 +14,10 @@ ifeq ($(PLATFORM_VENDOR),1)
LOCAL_VENDOR_MODULE := true
endif
$(info Remove $(LOCAL_PATH)/../include/vsi_nn_feature_config.h ...)
$(shell rm $(LOCAL_PATH)/../include/vsi_nn_feature_config.h -rf)
$(info $(shell bash $(LOCAL_PATH)/../gcc_gen_feature_config_header.sh $(LOCAL_PATH)/..))
LOCAL_SRC_FILES := \
vsi_nn_context.c \
vsi_nn_client_op.c \
@ -59,12 +63,6 @@ LOCAL_SRC_FILES += \
post/vsi_nn_post_fasterrcnn.c \
post/vsi_nn_post_cmupose.c
LOCAL_SRC_FILES += \
cpu_backend/vsi_nn_cpu_backend.c \
cpu_backend/vsi_nn_cpu_backend_conv2d.c \
cpu_backend/vsi_nn_cpu_backend_deconv2d.c \
cpu_backend/npuref_interface.c
LOCAL_SRC_FILES += libnnext/vsi_nn_libnnext_resource.c \
libnnext/vsi_nn_vxkernel.c
@ -78,11 +76,10 @@ LOCAL_SRC_FILES += kernel/vsi_nn_kernel.c \
kernel/vsi_nn_kernel_param.c \
kernel/vsi_nn_kernel_gpu_shape_optimize.c \
kernel/vsi_nn_kernel_lut.c \
kernel/vsi_nn_spinst.c \
kernel/vsi_nn_sp_unit_operation.c \
kernel/vsi_nn_sp_lut.c \
kernel/vsi_nn_gpu.c
LOCAL_SRC_FILES += vip/virtual_device.cpp
LIBNNEXT_KERNEL_SOURCES := $(wildcard $(LOCAL_PATH)/libnnext/ops/kernel/*.c)
LOCAL_SRC_FILES += $(LIBNNEXT_KERNEL_SOURCES:$(LOCAL_PATH)/%=%)
@ -117,13 +114,14 @@ LOCAL_C_INCLUDES += \
$(AQROOT)/sdk/inc/ \
$(AQROOT)/sdk/inc/HAL \
$(LOCAL_PATH)/../include \
$(LOCAL_PATH)/../include/vip \
$(LOCAL_PATH)/../include/ops \
$(LOCAL_PATH)/../include/utils \
$(LOCAL_PATH)/../include/infernce \
$(LOCAL_PATH)/../include/client \
$(LOCAL_PATH)/../include/cpu_backend \
$(LOCAL_PATH)/../include/libnnext \
$(LOCAL_PATH)/../src
$(LOCAL_PATH)/../src \
$(LOCAL_PATH)/../src/vip
LOCAL_CFLAGS := \
-DLINUX \

View File

@ -22,6 +22,7 @@
*
*****************************************************************************/
#if !(VX_RELATIONAL_OPS_VX_SUPPORT_EXT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -293,6 +294,16 @@ static vsi_status _query_kernel
input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE && input0_dtype == I16)
{
input0_dtype = I32;
}
if (inputs[1]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE && input1_dtype == I16)
{
input1_dtype = I32;
}
if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE && output_dtype == I8)
{
output_dtype = BOOL8;
@ -452,3 +463,4 @@ final:
REGISTER_BACKEND_CL( relational_ops, _setup )
__END_DECLS
#endif

View File

@ -0,0 +1,359 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
__BEGIN_DECLS
typedef enum _crop_and_resize_type_e
{
nearest_neighbor = VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR,
bilinear = VSI_NN_INTERPOLATION_BILINEAR,
}crop_and_resize_type_e;
#define _CROP_AND_RESIZE_KERNEL_SOURCE_NAME "crop_and_resize_"
// Add kernel hashtable here
#define CROP_AND_RESIZE_HASH_KEY( IN_DTYPE, OUT_DTYPE, RESIZE_METHOD ) \
(( IN_DTYPE << 16 ) | ( OUT_DTYPE << 8) | (RESIZE_METHOD))
#define CROP_AND_RESIZE_KERNEL( IN_DTYPE, OUT_DTYPE, RESIZE_METHOD ) \
{ CROP_AND_RESIZE_HASH_KEY( IN_DTYPE, OUT_DTYPE, RESIZE_METHOD ), \
CVIVANTE_NAMESPACE("cl.crop_and_resize_"#RESIZE_METHOD"_"#IN_DTYPE"to"#OUT_DTYPE), \
_CROP_AND_RESIZE_KERNEL_SOURCE_NAME#RESIZE_METHOD }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _crop_and_resize_kernel_map[] =
{
// Register kernel here
CROP_AND_RESIZE_KERNEL( U32, U32, nearest_neighbor ),
CROP_AND_RESIZE_KERNEL( U32, F32, nearest_neighbor ),
CROP_AND_RESIZE_KERNEL( F32, F32, nearest_neighbor),
CROP_AND_RESIZE_KERNEL( F32, U32, nearest_neighbor ),
CROP_AND_RESIZE_KERNEL( F32, I32, nearest_neighbor),
CROP_AND_RESIZE_KERNEL( I32, I32, nearest_neighbor ),
CROP_AND_RESIZE_KERNEL( I32, F32, nearest_neighbor),
CROP_AND_RESIZE_KERNEL( U32, U32, bilinear),
CROP_AND_RESIZE_KERNEL( U32, F32, bilinear),
CROP_AND_RESIZE_KERNEL( F32, F32, bilinear),
CROP_AND_RESIZE_KERNEL( F32, U32, bilinear),
CROP_AND_RESIZE_KERNEL( F32, I32, bilinear),
CROP_AND_RESIZE_KERNEL( I32, I32, bilinear),
CROP_AND_RESIZE_KERNEL( I32, F32, bilinear),
};
/*
* Kernel params
*/
static vx_param_description_t _crop_and_resize_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _CROP_AND_RESIZE_PARAM_NUM _cnt_of_array( _crop_and_resize_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_crop_and_resize_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
};
vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
int32_t crop_width = 0;
int32_t crop_height = 0;
int32_t image_width = 0;
int32_t image_height = 0;
int32_t batch_out = 0;
float width_scale = 0;
float height_scale = 0;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &batch_out);
CHECK_STATUS_FAIL_GOTO(status, final );
image_width = (int32_t)(attr[0]->shape->data[0]);
image_height = (int32_t)(attr[0]->shape->data[1]);
crop_width = (int32_t)(attr[1]->shape->data[0]);
crop_height = (int32_t)(attr[1]->shape->data[1]);
width_scale = (crop_width > 1) ? (float)(image_width - 1) / (crop_width -1) : 0;
height_scale = (crop_height > 1) ? (float)(image_height - 1) / (crop_height -1) : 0;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = (crop_width + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0];
gpu_param.global_size[1] = (crop_height + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1];
gpu_param.global_size[2] = (batch_out + gpu_param.global_scale[2] - 1)
/ gpu_param.global_scale[2];
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, final);
status = vsi_nn_kernel_gpu_add_param( node, "width_scale", &width_scale );
status |= vsi_nn_kernel_gpu_add_param( node, "height_scale", &height_scale );
status |= vsi_nn_kernel_gpu_add_param( node, "image_width", &image_width );
status |= vsi_nn_kernel_gpu_add_param( node, "image_height", &image_height );
CHECK_STATUS_FAIL_GOTO(status, final);
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;
} /* _crop_and_resize_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t resize_method
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _crop_and_resize_kernel_map;
size_t kernel_map_size = _cnt_of_array( _crop_and_resize_kernel_map );
vx_param_description_t * param_def = _crop_and_resize_kernel_param_def;
vx_kernel_initialize_f initializer = _crop_and_resize_initializer;
uint32_t key;
uint32_t i;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (F16 == in_dtype)
{
in_dtype = F32;
}
else if (U8 == in_dtype)
{
in_dtype = U32;
}
else if (I8 == in_dtype || I16 == in_dtype)
{
in_dtype = I32;
}
if (F16 == out_dtype)
{
out_dtype = F32;
}
else if (U8 == out_dtype)
{
out_dtype = U32;
}
else if (I8 == out_dtype || I16 == out_dtype)
{
out_dtype = I32;
}
key = CROP_AND_RESIZE_HASH_KEY( in_dtype, out_dtype, resize_method );
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = _cnt_of_array( _crop_and_resize_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_CROP_AND_RESIZE_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL;
vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{0}};
uint32_t ori_depth = (uint32_t)inputs[0]->attr.size[2];
uint32_t ori_batchout = (uint32_t)outputs[0]->attr.size[3];
float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float input_scale = vsi_nn_get_tensor_scale(inputs[0]);
float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float output_scale = vsi_nn_get_tensor_scale(outputs[0]);
float inOutScale = input_scale / output_scale;
float inOutTile = output_zp - inOutScale * input_zp;
float extrapolation_value = vsi_nn_kernel_param_get_float32( params, "extrapolation_value" );
int32_t resize_method = vsi_nn_kernel_param_get_int32( params, "resize_method" );
VSI_UNREFERENCED(params);
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
shapes[0][0] = inputs[0]->attr.size[0];
shapes[0][1] = inputs[0]->attr.size[1];
shapes[0][2] = inputs[0]->attr.size[2] * inputs[0]->attr.size[3];
shapes[1][0] = outputs[0]->attr.size[0];
shapes[1][1] = outputs[0]->attr.size[1];
shapes[1][2] = outputs[0]->attr.size[2] * outputs[0]->attr.size[3];
rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[0], 3 );
rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[1], 3 );
if (rs_input == NULL || rs_output == NULL)
{
goto final;
}
status = _query_kernel( kernel, inputs, outputs, resize_method );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
node_params[0] = rs_input;
node_params[1] = (vsi_nn_kernel_node_param_t)(inputs[1]->t);
node_params[2] = (vsi_nn_kernel_node_param_t)(inputs[2]->t);
node_params[3] = rs_output;
node_params[4] = vsi_nn_kernel_scalar_create( graph, I32, &ori_depth );
node_params[5] = vsi_nn_kernel_scalar_create( graph, I32, &ori_batchout );
node_params[6] = vsi_nn_kernel_scalar_create( graph, F32, &inOutScale );
node_params[7] = vsi_nn_kernel_scalar_create( graph, F32, &inOutTile );
node_params[8] = vsi_nn_kernel_scalar_create( graph, F32, &extrapolation_value );
status = vsi_nn_kernel_node_pass_param( node, node_params, _CROP_AND_RESIZE_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
vsi_nn_kernel_scalar_release( &node_params[6] );
vsi_nn_kernel_scalar_release( &node_params[7] );
vsi_nn_kernel_scalar_release( &node_params[8] );
}
}
final:
if (rs_input)
{
vsi_nn_kernel_tensor_release( &rs_input );
}
if (rs_output)
{
vsi_nn_kernel_tensor_release( &rs_output );
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( crop_and_resize, _setup )

View File

@ -22,7 +22,7 @@
*
*****************************************************************************/
#if !(VX_DEPTH2SPACE_CRD_MODE_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -228,4 +228,4 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CL( depth2space_internal, _setup )
#endif

View File

@ -1,300 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
typedef enum
{
INTERNAL_KERNEL_DETECT_POST_BOX,
} _internal_kernel_e;
#define _DETECT_POST_BOX_KERNEL_SOURCE "detect_post_box"
#define STR(a) #a
// Add kernel hashtable here
#define DETECT_POST_BOX_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
((IN0_DTYPE << 18) | ( IN1_DTYPE << 11 ) | ( OUT_DTYPE << 4))
#define PACK_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
{ DETECT_POST_BOX_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE), \
CVIVANTE_NAMESPACE("cl.detect_post_box_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \
_DETECT_POST_BOX_KERNEL_SOURCE}
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _detect_post_box_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP( F32, F32, F32 ),
PACK_KERNEL_MAP( U8, U8, F32 ),
};
/*
* Kernel params
*/
static vx_param_description_t _detect_post_box_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _DETECT_POST_BOX_PARAM_NUM _cnt_of_array( _detect_post_box_kernel_param_def )
#define _DETECT_POST_BOX_F32_PARAM_NUM 8
#define SCALAR_SCALE_Y (3)
#define SCALAR_SCALE_X (4)
#define SCALAR_SCALE_H (5)
#define SCALAR_SCALE_W (6)
#define SCALAR_LOG_E (7)
#define SCALAR_TAIL0 (8)
#define SCALAR_TAIL1 (9)
#define SCALAR_SCALE0 (10)
#define SCALAR_SCALE1 (11)
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_detect_post_box_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
vsi_size_array_t * in_shape = NULL;
VSI_UNREFERENCED(param_size);
VSI_UNREFERENCED(node);
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
in_shape = input_attr->shape;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.dim = 2;
gpu_param.global_size[0] = (
(in_shape->data[1] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0]);
gpu_param.global_size[1] = (
(in_shape->data[2] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1]);
gpu_param.global_size[2] = 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
SAFE_FREE_TENSOR_ATTR(input_attr);
return status;
} /* _detect_post_box_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
vsi_bool *is_use_u8_kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in0_dtype;
vsi_nn_kernel_dtype_e in1_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _detect_post_box_kernel_map;
size_t kernel_map_size = _cnt_of_array( _detect_post_box_kernel_map );
vx_param_description_t * param_def = _detect_post_box_kernel_param_def;
size_t param_def_size = _cnt_of_array( _detect_post_box_kernel_param_def );
vx_kernel_initialize_f initializer = _detect_post_box_initializer;
uint32_t key;
uint32_t i;
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if ((U8 == in0_dtype) && (U8 == in1_dtype))
{
*is_use_u8_kernel = TRUE;
param_def_size = _DETECT_POST_BOX_PARAM_NUM;
}
else
{
*is_use_u8_kernel = FALSE;
param_def_size = _DETECT_POST_BOX_F32_PARAM_NUM;
}
key = DETECT_POST_BOX_HASH_KEY( in0_dtype, in1_dtype, out_dtype );
for ( i = 0; i < kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = (vx_uint32)param_def_size;
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_DETECT_POST_BOX_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
float logE = (float)(log10(exp(1.0f)) / log10(2.0f));
float inv_scale_y = vsi_nn_kernel_param_get_float32( params, "inv_scale_y" );
float inv_scale_x = vsi_nn_kernel_param_get_float32( params, "inv_scale_x" );
float inv_scale_h = vsi_nn_kernel_param_get_float32( params, "inv_scale_h" );
float inv_scale_w = vsi_nn_kernel_param_get_float32( params, "inv_scale_w" );
vsi_bool is_use_u8_kernel = FALSE;
float input0Scale = vsi_nn_get_tensor_scale(inputs[0]);
float input0Zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float input0Tail = -input0Zp * input0Scale;
float input1Scale = vsi_nn_get_tensor_scale(inputs[1]);
float input1Zp = (float)vsi_nn_get_tensor_zero_point(inputs[1]);
float input1Tail = -input1Zp * input1Scale;
status = _query_kernel( kernel, inputs, outputs, &is_use_u8_kernel );
if ( VSI_SUCCESS == status )
{
size_t node_params_num = _DETECT_POST_BOX_F32_PARAM_NUM;
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _DETECT_POST_BOX_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[SCALAR_SCALE_Y] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_y );
node_params[SCALAR_SCALE_X] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_x );
node_params[SCALAR_SCALE_H] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_h );
node_params[SCALAR_SCALE_W] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_w );
node_params[SCALAR_LOG_E] = vsi_nn_kernel_scalar_create( graph, F32, &logE );
if (is_use_u8_kernel)
{
node_params[SCALAR_TAIL0] = vsi_nn_kernel_scalar_create( graph, F32, &input0Tail );
node_params[SCALAR_TAIL1] = vsi_nn_kernel_scalar_create( graph, F32, &input1Tail );
node_params[SCALAR_SCALE0] = vsi_nn_kernel_scalar_create( graph, F32, &input0Scale );
node_params[SCALAR_SCALE1] = vsi_nn_kernel_scalar_create( graph, F32, &input1Scale );
node_params_num = _DETECT_POST_BOX_PARAM_NUM;
}
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
VSI_ASSERT( status == VSI_SUCCESS );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_Y] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_X] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_H] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_W] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_LOG_E] );
if (is_use_u8_kernel)
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_TAIL0] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_TAIL1] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE0] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE1] );
}
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( detect_post_box, _setup )

View File

@ -1,197 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
#if 0
/*
* Define kernel meta.
*/
typedef enum
{
INTERNAL_KERNEL_DETECT_POST_NMS,
} _internal_kernel_e;
#define _DETECT_POST_NMS_KERNEL_SOURCE "detect_post_nms"
#define _DETECT_POST_NMS_KERNEL_NAME CVIVANTE_NAMESPACE("cl.detect_post_nms")
// Add kernel hashtable here
#define DETECT_POST_NMS_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
(( IN_DTYPE << 8 ) | ( OUT_DTYPE ))
#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, SOURCE ) \
{ DETECT_POST_NMS_HASH_KEY( IN_DTYPE, OUT_DTYPE ), _DETECT_POST_NMS_KERNEL_NAME, SOURCE }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _detect_post_nms_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP( F32, F32, _DETECT_POST_NMS_KERNEL_SOURCE ),
};
/*
* Kernel params
*/
static vx_param_description_t _detect_post_nms_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _DETECT_POST_NMS_PARAM_NUM _cnt_of_array( _detect_post_nms_kernel_param_def )
#define SCALAR_NMS_TYPE (6)
#define SCALAR_MAX_NUM (7)
#define SCALAR_MAX_CLASS (8)
#define SCALAR_MAX_DETECT (9)
#define SCALAR_SCORE_TH (10)
#define SCALAR_IOU_TH (11)
#define SCALAR_IS_BG (12)
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_detect_post_nms_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
return status;
} /* _detect_post_nms_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _detect_post_nms_kernel_map;
size_t kernel_map_size = _cnt_of_array( _detect_post_nms_kernel_map );
vx_param_description_t * param_def = _detect_post_nms_kernel_param_def;
size_t param_def_size = _cnt_of_array( _detect_post_nms_kernel_param_def );
vx_kernel_initialize_f initializer = _detect_post_nms_initializer;
uint32_t key;
uint32_t i;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = DETECT_POST_NMS_HASH_KEY( in_dtype, out_dtype );
for ( i = 0; i < kernel_map_size; i++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = param_def_size;
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
#endif
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_nn_kernel_node_t node = NULL;
VSI_UNREFERENCED(graph);
VSI_UNREFERENCED(inputs);
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(outputs);
VSI_UNREFERENCED(output_num);
VSI_UNREFERENCED(params);
VSI_UNREFERENCED(kernel);
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( detect_post_nms, _setup )

View File

@ -60,6 +60,7 @@ typedef enum
UNARY_ATANH,
UNARY_ACOSH,
UNARY_INVERSE_SIGMOID,
UNARY_TAN,
} unary_type_e;
/*
@ -108,6 +109,7 @@ typedef enum
#define ATANH_OPERATION atanh
#define ACOSH_OPERATION acosh
#define INVERSE_SIGMOID_OPERATION inverse_sigmoid
#define TAN_OPERATION tan
#define ADD_UNARY_SH_KERNELS(name) \
TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, F32, F32) \
@ -142,6 +144,7 @@ static const struct {
ADD_UNARY_SH_KERNELS(ATANH)
ADD_UNARY_SH_KERNELS(ACOSH)
ADD_UNARY_SH_KERNELS(INVERSE_SIGMOID)
ADD_UNARY_SH_KERNELS(TAN)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I32, I32)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I32, I32)
@ -166,6 +169,7 @@ static const struct {
#undef ATANH_OPERATION
#undef ACOSH_OPERATION
#undef INVERSE_SIGMOID_OPERATION
#undef TAN_OPERATION
/*
* Kernel params
*/
@ -452,16 +456,22 @@ OnError:
REGISTER_BACKEND_CL( KERNEL_NAME, _##KERNEL_NAME##_setup )
#if !(VX_ACTIVATION_SIN_COS_VX_SUPPORT_EXT)
REGISTER_ELTWISE_UNARY_BACKEND_CL( sin, UNARY_SIN )
REGISTER_ELTWISE_UNARY_BACKEND_CL( cos, UNARY_COS )
#endif
#if !(VX_ACTIVATION_EXP_VX_SUPPORT_EXT)
REGISTER_ELTWISE_UNARY_BACKEND_CL( exp, UNARY_EXP )
#endif
REGISTER_ELTWISE_UNARY_BACKEND_CL( log, UNARY_LOG )
REGISTER_ELTWISE_UNARY_BACKEND_CL( neg, UNARY_NEG )
REGISTER_ELTWISE_UNARY_BACKEND_CL( hard_sigmoid, UNARY_HSIGMOID )
REGISTER_ELTWISE_UNARY_BACKEND_CL( mish, UNARY_MISH )
REGISTER_ELTWISE_UNARY_BACKEND_CL( round, UNARY_ROUND )
#if !(VX_ACTIVATION_GELU_VX_SUPPORT_EXT)
REGISTER_ELTWISE_UNARY_BACKEND_CL( gelu, UNARY_GELU )
REGISTER_ELTWISE_UNARY_BACKEND_CL( hard_gelu, UNARY_HGELU )
#endif
REGISTER_ELTWISE_UNARY_BACKEND_CL( selu, UNARY_SELU )
REGISTER_ELTWISE_UNARY_BACKEND_CL( celu, UNARY_CELU )
REGISTER_ELTWISE_UNARY_BACKEND_CL( rcp, UNARY_RCP )
@ -471,5 +481,6 @@ REGISTER_ELTWISE_UNARY_BACKEND_CL( atan, UNARY_ATAN )
REGISTER_ELTWISE_UNARY_BACKEND_CL( atanh, UNARY_ATANH )
REGISTER_ELTWISE_UNARY_BACKEND_CL( acosh, UNARY_ACOSH )
REGISTER_ELTWISE_UNARY_BACKEND_CL( inverse_sigmoid, UNARY_INVERSE_SIGMOID )
REGISTER_ELTWISE_UNARY_BACKEND_CL( tan, UNARY_TAN )
__END_DECLS

View File

@ -22,7 +22,7 @@
*
*****************************************************************************/
#if !(VX_TENSOR_GATHER_API_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -420,3 +420,4 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CL( gather, _setup )
#endif

View File

@ -90,6 +90,8 @@ static vx_param_description_t _grucell_activation_z_h_kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _GRUCELL_ACTIVATION_Z_H_PARAM_NUM _cnt_of_array( _grucell_activation_z_h_kernel_param_def )
@ -97,6 +99,8 @@ static vx_param_description_t _grucell_activation_z_h_kernel_param_def[] =
#define SCALAR_INPUT_TAIL (8)
#define SCALAR_OUTPUT_SCALE (9)
#define SCALAR_OUTPUT_ZP (10)
#define SCALAR_OUTPUT1_SCALE (11)
#define SCALAR_OUTPUT1_ZP (12)
/*
* Kernel initializer
*/
@ -244,6 +248,8 @@ static vsi_nn_kernel_node_t _setup
float input_tail = -(float)vsi_nn_get_tensor_zero_point(inputs[GRUCELL_ACT_Z_H_HSTATE]) * input_scale;
float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[GRUCELL_ACT_Z_H_OUT_OUTPUT]);
float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[GRUCELL_ACT_Z_H_OUT_OUTPUT]);
float output_scale1 = 1.0f / vsi_nn_get_tensor_scale(outputs[GRUCELL_ACT_Z_H_OUT_HSTATE]);
float output_zp1 = (float)vsi_nn_get_tensor_zero_point(outputs[GRUCELL_ACT_Z_H_OUT_HSTATE]);
if( activation != VSI_NN_ACT_TANH )
{
@ -268,11 +274,17 @@ static vsi_nn_kernel_node_t _setup
graph, F32, &output_scale );
node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create(
graph, F32, &output_zp );
node_params[SCALAR_OUTPUT1_SCALE] = vsi_nn_kernel_scalar_create(
graph, F32, &output_scale1 );
node_params[SCALAR_OUTPUT1_ZP] = vsi_nn_kernel_scalar_create(
graph, F32, &output_zp1 );
status = vsi_nn_kernel_node_pass_param( node, node_params, _GRUCELL_ACTIVATION_Z_H_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT1_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT1_ZP] );
}
}
return node;

View File

@ -46,6 +46,7 @@ typedef enum _grucell_nn_activation_type_e
{
SIGMOID = VSI_NN_ACT_SIGMOID,
HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID,
RELU = VSI_NN_ACT_RELU,
}grucell_nn_activation_type_e;
#define _GRUCELL_RESET_AFTER_ACTIVATION_KERNEL_SOURCE "grucell_reset_after_activation"
@ -71,6 +72,9 @@ static const _kernel_map_type _grucell_reset_after_activation_kernel_map[] =
PACK_KERNEL_MAP( U8, F32, U8, SIGMOID ),
PACK_KERNEL_MAP( I32, F32, I32, SIGMOID ),
PACK_KERNEL_MAP( F32, F32, F32, SIGMOID ),
PACK_KERNEL_MAP( U8, F32, U8, RELU ),
PACK_KERNEL_MAP( I32, F32, I32, RELU ),
PACK_KERNEL_MAP( F32, F32, F32, RELU ),
};

View File

@ -22,7 +22,7 @@
*
*****************************************************************************/
#if !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -360,3 +360,4 @@ final:
__END_DECLS
REGISTER_BACKEND_CL( layer_norm, _setup )
#endif

View File

@ -22,6 +22,7 @@
*
*****************************************************************************/
#if !(VX_LOGSOFTMAX_VX_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -34,6 +35,7 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
__BEGIN_DECLS
@ -41,27 +43,30 @@ __BEGIN_DECLS
/*
* Define kernel meta.
*/
#define HASH_LOG_SOFTMAX_KEY(_axis, _input_type, _output_type, _image_2d) \
((_axis << 20) | (_input_type << 12) | (_output_type << 4) | (_image_2d))
#define HASH_LOG_SOFTMAX_KEY(_axis, _input_type, _output_type, _image_2d, exceed_limit) \
((_axis << 24) | (_input_type << 16) | (_output_type << 8) | (_image_2d << 4) | exceed_limit)
#define VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(_axis) \
"log_softmax_axis"#_axis
#define VSI_NN_GEN_LOG_SOFTMAX_EXCEED_KERNEL_SOURCE_NAME(_axis) \
"log_softmax_exceed_axis"#_axis
#define HASH_LOG_SOFTMAX_SH_KERNEL_NAME(AXIS, SRC0_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("cl.log_softmax_axis"#AXIS"_"#SRC0_TYPE"to"#DST_TYPE)
#define TENSOR_LOG_SOFTMAX_KERNELS(AXIS, SRC0_TYPE, OUT_TYPE) \
{ HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0), \
{ HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0, 0), \
HASH_LOG_SOFTMAX_SH_KERNEL_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \
VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) },
#define TENSOR_LOG_SOFTMAX_FLOAT(AXIS, SRC0_TYPE, OUT_TYPE) \
{ HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0), \
{ HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0, 0), \
HASH_LOG_SOFTMAX_SH_KERNEL_NAME(AXIS, F32, F32), \
VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) },
#define TENSOR_LOG_SOFTMAX_BFLOAT(AXIS, SRC0_TYPE, OUT_TYPE) \
{ HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0), \
{ HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0, 0), \
HASH_LOG_SOFTMAX_SH_KERNEL_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \
VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) },
@ -69,20 +74,28 @@ __BEGIN_DECLS
CVIVANTE_NAMESPACE("cl.log_softmax_axis"#AXIS"_"#SRC0_TYPE"to"#DST_TYPE"_2D")
#define TENSOR_LOG_SOFTMAX_KERNELS_2D(AXIS, SRC0_TYPE, OUT_TYPE) \
{ HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1), \
{ HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1, 0), \
HASH_LOG_SOFTMAX_SH_KERNEL_2D_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \
VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) },
#define TENSOR_LOG_SOFTMAX_FLOAT_2D(AXIS, SRC0_TYPE, OUT_TYPE) \
{ HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1), \
{ HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1, 0), \
HASH_LOG_SOFTMAX_SH_KERNEL_2D_NAME(AXIS, F32, F32), \
VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) },
#define TENSOR_LOG_SOFTMAX_BFLOAT_2D(AXIS, SRC0_TYPE, OUT_TYPE) \
{ HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1), \
{ HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1, 0), \
HASH_LOG_SOFTMAX_SH_KERNEL_2D_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \
VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) },
#define HASH_LOG_SOFTMAX_EXCEED_SH_KERNEL_NAME(AXIS, SRC0_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("cl.log_softmax_exceed_axis"#AXIS"_"#SRC0_TYPE"to"#DST_TYPE)
#define TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(AXIS, SRC0_TYPE, OUT_TYPE) \
{ HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0, 1), \
HASH_LOG_SOFTMAX_EXCEED_SH_KERNEL_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \
VSI_NN_GEN_LOG_SOFTMAX_EXCEED_KERNEL_SOURCE_NAME(AXIS) },
static const struct {
uint32_t key;
char* function_name;
@ -92,31 +105,31 @@ static const struct {
TENSOR_LOG_SOFTMAX_FLOAT(0, F32, F32)
TENSOR_LOG_SOFTMAX_FLOAT(1, F32, F32)
TENSOR_LOG_SOFTMAX_FLOAT(2, F32, F32)
TENSOR_LOG_SOFTMAX_FLOAT(0, F16, F16)
TENSOR_LOG_SOFTMAX_FLOAT(1, F16, F16)
TENSOR_LOG_SOFTMAX_FLOAT(2, F16, F16)
TENSOR_LOG_SOFTMAX_BFLOAT(0, BF16, BF16)
TENSOR_LOG_SOFTMAX_BFLOAT(1, BF16, BF16)
TENSOR_LOG_SOFTMAX_BFLOAT(2, BF16, BF16)
TENSOR_LOG_SOFTMAX_FLOAT_2D(0, F32, F32)
TENSOR_LOG_SOFTMAX_FLOAT_2D(1, F32, F32)
TENSOR_LOG_SOFTMAX_FLOAT_2D(0, F16, F16)
TENSOR_LOG_SOFTMAX_FLOAT_2D(1, F16, F16)
TENSOR_LOG_SOFTMAX_BFLOAT_2D(0, BF16, BF16)
TENSOR_LOG_SOFTMAX_BFLOAT_2D(1, BF16, BF16)
TENSOR_LOG_SOFTMAX_KERNELS(0, U8, U8)
TENSOR_LOG_SOFTMAX_KERNELS(1, U8, U8)
TENSOR_LOG_SOFTMAX_KERNELS(2, U8, U8)
TENSOR_LOG_SOFTMAX_KERNELS(0, U8, U8)
TENSOR_LOG_SOFTMAX_KERNELS(1, U8, U8)
TENSOR_LOG_SOFTMAX_KERNELS(2, U8, U8)
TENSOR_LOG_SOFTMAX_KERNELS_2D(0, U8, U8)
TENSOR_LOG_SOFTMAX_KERNELS_2D(1, U8, U8)
TENSOR_LOG_SOFTMAX_KERNELS_2D(0, U8, U8)
TENSOR_LOG_SOFTMAX_KERNELS_2D(1, U8, U8)
TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(0, U8, U8)
TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(1, U8, U8)
TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(0, F32, F32)
TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(1, F32, F32)
TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(0, BF16, BF16)
TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(1, BF16, BF16)
};
/*
@ -198,12 +211,89 @@ final:
return status;
} /* _log_softmax_initializer() */
DEF_KERNEL_INITIALIZER(_log_softmax_exceed_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
gpu_param_t gpu_param = {
2, // workdim
{0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in thread
{0, 0, 0} // globalWorkSize: image size in thread
};
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
vsi_size_array_t * out_shape = NULL;
int32_t axis = 0;
int32_t width = 0;
int32_t height = 0;
int32_t depth = 0;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis);
CHECK_STATUS_FAIL_GOTO(status, final );
out_shape = attr[1]->shape;
width = (int32_t)(out_shape->data[0]);
height = (int32_t)(out_shape->data[1]);
depth = attr[1]->shape->size > 2 ? (int32_t)(out_shape->data[2]) : 1;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
if (axis == 0)
{
gpu_param.global_size[0] = 1;
gpu_param.global_size[1] = depth;
}
else
{
gpu_param.global_size[0] = width;
gpu_param.global_size[1] = 1;
}
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
if (axis == 0)
{
status |= vsi_nn_kernel_gpu_add_param( node, "width", &width );
}
else
{
status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth );
}
status |= vsi_nn_kernel_gpu_add_param( node, "height", &height );
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
}
return status;
}
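/* For reference (a sketch of the math, not the shader code itself): the
 * numerically stable form these kernels target is
 *     log_softmax(x[i]) = beta * (x[i] - max(x)) - log( sum_j exp( beta * (x[j] - max(x)) ) )
 * where beta already has the input quantization scale folded in by _setup().
 * Judging from the global sizes chosen above, the "exceed" variant is used
 * when the softmax axis is too long to map onto an image dimension, so the
 * dispatch runs over the remaining axis (depth for axis 0, width for axis 1)
 * and the shader iterates over the long axis internally.
 */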
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
int32_t axis,
vsi_bool image_2d,
vsi_bool exceed_limit,
vsi_nn_kernel_t* kernel
)
{
@ -215,7 +305,17 @@ static vsi_status _query_kernel
input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = HASH_LOG_SOFTMAX_KEY( axis, input_dtype, output_dtype, image_2d );
if (input_dtype == F16)
{
input_dtype = F32;
}
if (output_dtype == F16)
{
output_dtype = F32;
}
if (exceed_limit) image_2d = vx_false_e;
key = HASH_LOG_SOFTMAX_KEY( axis, input_dtype, output_dtype, image_2d, exceed_limit );
for( i = 0; i < _cnt_of_array(kernel_map); i ++ )
{
@ -229,7 +329,14 @@ static vsi_status _query_kernel
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = kernel_param_def;
kernel->info.numParams = _cnt_of_array( kernel_param_def );
kernel->info.initialize = _log_softmax_initializer;
if (exceed_limit)
{
kernel->info.initialize = _log_softmax_exceed_initializer;
}
else
{
kernel->info.initialize = _log_softmax_initializer;
}
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
kernel_map[i].source_name );
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
@ -254,7 +361,14 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL};
vsi_bool image_2d = FALSE;
vsi_nn_kernel_node_t node = NULL;
vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } };
uint32_t rank_in = 0;
int32_t axis = 0;
int32_t new_axis = 0;
vsi_bool ret = vx_false_e;
vsi_bool exceed_limit = vx_false_e;
uint32_t i = 0;
float beta = 0;
float inputScale = vsi_nn_get_tensor_scale(inputs[0]);
float outputScale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]);
@ -270,16 +384,37 @@ static vsi_nn_kernel_node_t _setup
scaleValue = scaleValue * beta * inputScale;
beta = beta * inputScale;
if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size,
inputs[0]->attr.dim_num )
|| axis > 2)
if (inputs[0]->attr.size[axis] >= GPU_TENSOR_MAX_WIDTH)
{
exceed_limit = vx_true_e;
}
ret = vsi_nn_kernel_optimize_softmax_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
shapes[0], &rank_in, &new_axis);
if (ret)
{
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shapes[0], rank_in );
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[0], rank_in );
}
else
{
return NULL;
}
image_2d = ((inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1)
&& axis != 2);
status = _query_kernel( inputs, outputs, axis, image_2d, kernel );
if( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[0]->attr.size,
reshape_tensors[0]->attr.dim_num )
|| new_axis > 2 || (new_axis == 2 && exceed_limit))
{
return NULL;
}
image_2d = ((reshape_tensors[0]->attr.dim_num == 2 || reshape_tensors[0]->attr.size[2] == 1)
&& new_axis != 2);
status = _query_kernel( inputs, outputs, new_axis, image_2d, exceed_limit, kernel );
if( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
@ -287,10 +422,10 @@ static vsi_nn_kernel_node_t _setup
if( node )
{
vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM,
inputs, 1, outputs, 1 );
reshape_tensors, 1, &reshape_tensors[1], 1 );
node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create(
graph, I32, &axis );
graph, I32, &new_axis );
node_params[SCALAR_INPUT_BETA] = vsi_nn_kernel_scalar_create(
graph, F32, &beta );
node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create(
@ -311,9 +446,16 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] );
}
}
for (i = 0; i < 2; i++)
{
vsi_safe_release_tensor(reshape_tensors[i]);
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( log_softmax, _setup )
#endif

View File

@ -75,6 +75,9 @@ __BEGIN_DECLS
#define HASH_MATRIXMUL_4X_TRANSA_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \
CVIVANTE_NAMESPACE("cl.gemm_4x_transa_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM)
#define HASH_MATRIXMUL_4X_TRANSA_LOCAL_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \
CVIVANTE_NAMESPACE("cl.gemm_4x_transa_local_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM)
#define TENSOR_MATRIXMUL_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \
{ HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 0, 0), \
HASH_MATRIXMUL_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \
@ -90,6 +93,11 @@ __BEGIN_DECLS
HASH_MATRIXMUL_4X_TRANSA_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \
SOURCE },
#define TENSOR_MATRIXMUL_4X_TRANSA_LOCAL_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \
{HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 2, 1, 0), \
HASH_MATRIXMUL_4X_TRANSA_LOCAL_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \
SOURCE },
#define TENSOR_MATRIXMUL_TRANSA_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \
{HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 1, 0), \
HASH_MATRIXMUL_TRANSA_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \
@ -142,6 +150,7 @@ static const struct {
TENSOR_MATRIXMUL_MERGE_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_3)
TENSOR_MATRIXMUL_4X_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_4)
TENSOR_MATRIXMUL_4X_TRANSA_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_4)
TENSOR_MATRIXMUL_4X_TRANSA_LOCAL_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_4)
};
/*
@ -313,6 +322,49 @@ final:
return status;
} /* _matrixmul_4x_initializer() */
DEF_KERNEL_INITIALIZER(_matrixmul_4x_local_initializer)
(vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t* param,
size_t param_size) {
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {3, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}};
vsi_nn_kernel_tensor_attr_t* attr = NULL;
vsi_size_t width = 0;
VSI_UNREFERENCED(param_size);
attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[2]);
CHECK_PTR_FAIL_GOTO(attr, "Create tensor attr buffer fail.", final);
width = attr->shape->data[0];
gpu_param.dim = 2;
gpu_param.local_size[0] = 1;
gpu_param.local_size[1] = 64;
gpu_param.local_size[2] = 1;
gpu_param.global_scale[0] = 16;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] =
(width + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0];
gpu_param.global_size[1] = 64;
gpu_param.global_size[2] = 1;
status = vsi_nn_kernel_gpu_config(node, &gpu_param);
CHECK_STATUS_FAIL_GOTO(status, final);
final:
if (attr) {
vsi_nn_kernel_tensor_attr_release(&attr);
attr = NULL;
}
return status;
} /* _matrixmul_4x_local_initializer() */
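/* Dispatch sketch for the 4x-local path, read from the values above (an
 * interpretation, not a spec): global_scale[0] = 16 means each work-item
 * covers a 16-wide slice of the tensor bound at param[2] (presumably the
 * output), and global_size[0] is the ceiling division
 *     (width + 16 - 1) / 16, e.g. width = 1000 -> 63 work-items along x.
 * The y dimension is pinned to one 64-thread work-group
 * (local_size[1] == global_size[1] == 64), which matches the
 * subGroupSize >= 64 / use_40bits_va gate applied in _setup() before
 * flag_4x is promoted to 2.
 */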
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
@ -403,7 +455,10 @@ static vsi_status _query_kernel
kernel->info.numParams = _cnt_of_array( _matrixmul_merge_kernel_param_def );
}
if (flag_4x) {
if ((flag_4x == 2) && (transa == 1)) {
kernel->info.initialize = _matrixmul_4x_local_initializer;
}
else if (flag_4x == 1) {
kernel->info.initialize = _matrixmul_4x_initializer;
} else {
kernel->info.initialize = _matrixmul_initializer;
@ -471,6 +526,7 @@ static vsi_nn_kernel_node_t _setup
uint32_t stride_axis_in_out[9] = {0};
vsi_nn_tensor_t* tmp_inputs[2] = {NULL};
vsi_nn_tensor_t* tmp_outputs[1] = {NULL};
vsi_bool shader_cnt_support = FALSE;
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
@ -585,7 +641,20 @@ static vsi_nn_kernel_node_t _setup
rs_out_tensors = vsi_nn_reshape_tensor(graph, tmp_outputs[0], final_shape, final_rank);
final_out_tensors[0] = rs_out_tensors;
flag_4x = 1;
#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
shader_cnt_support =
(graph->ctx->config.subGroupSize >= 64 && graph->ctx->config.use_40bits_va) ? TRUE : FALSE;
#endif
if ((in1_h % 64 == 0) && (transFlg == 1) && (out_h % 8 == 0) && shader_cnt_support)
{
flag_4x = 2;
}
else
{
flag_4x = 1;
}
}
}

View File

@ -246,28 +246,49 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL};
vsi_bool image_2d = FALSE;
vsi_nn_kernel_node_t node = NULL;
float input0Scale = vsi_nn_get_tensor_scale(inputs[0]);
float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input0Scale;
float input1Scale = vsi_nn_get_tensor_scale(inputs[1]);
float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1Scale;
float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float input0_scale = vsi_nn_get_tensor_scale(inputs[0]);
float input0_tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input0_scale;
float input1_scale = vsi_nn_get_tensor_scale(inputs[1]);
float input1_tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1_scale;
float output_scale = vsi_nn_get_tensor_scale(outputs[0]);
float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } };
vsi_size_t new_rank = 0;
vsi_bool ret = TRUE;
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
VSI_UNREFERENCED(params);
outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale;
output_scale = vsi_abs(output_scale) < 1e-5 ? 0.0f : 1.0f / output_scale;
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
ret = vsi_nn_kernel_optimize_eltwise_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num,
inputs[1]->attr.size, inputs[1]->attr.dim_num,
outputs[0]->attr.size, outputs[0]->attr.dim_num,
shapes[0], shapes[1], shapes[2], &new_rank );
if (ret == FALSE)
{
return NULL;
goto final;
}
image_2d = (outputs[0]->attr.dim_num == 2);
status = _query_kernel( inputs, outputs, image_2d, kernel );
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shapes[0], new_rank );
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
inputs[1], shapes[1], new_rank );
reshape_tensors[2] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[2], new_rank );
if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[2]->attr.size,
reshape_tensors[2]->attr.dim_num ) )
{
goto final;
}
image_2d = (reshape_tensors[2]->attr.dim_num == 2);
status = _query_kernel( reshape_tensors, &reshape_tensors[2], image_2d, kernel );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
@ -275,19 +296,19 @@ static vsi_nn_kernel_node_t _setup
if ( node )
{
vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM,
inputs, 2, outputs, 1 );
reshape_tensors, 2, &reshape_tensors[2], 1 );
node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create(
graph, F32, &input0Scale );
graph, F32, &input0_scale );
node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create(
graph, F32, &input0Tail );
graph, F32, &input0_tail );
node_params[SCALAR_INPUT1_SCALE] = vsi_nn_kernel_scalar_create(
graph, F32, &input1Scale );
graph, F32, &input1_scale );
node_params[SCALAR_INPUT1_TAIL] = vsi_nn_kernel_scalar_create(
graph, F32, &input1Tail );
graph, F32, &input1_tail );
node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create(
graph, F32, &outputScale );
graph, F32, &output_scale );
node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create(
graph, F32, &outputZP );
graph, F32, &output_zp );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM );
@ -300,6 +321,12 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] );
}
}
final:
vsi_safe_release_tensor(reshape_tensors[0]);
vsi_safe_release_tensor(reshape_tensors[1]);
vsi_safe_release_tensor(reshape_tensors[2]);
return node;
} /* _setup() */
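/* Note on the reshape step above (an assumption about the helper, not a
 * guarantee): vsi_nn_kernel_optimize_eltwise_shape() is expected to collapse
 * adjacent dimensions that broadcast the same way on both inputs, e.g. two
 * {4, 3, 2, 1} operands reducing to a {24, 1} pair, so the 2D kernel can be
 * selected more often. The node then binds reshape_tensors[0..2] instead of
 * the original tensors, and the final: block releases them whether or not
 * node creation succeeded.
 */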

View File

@ -246,29 +246,49 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL};
vsi_bool image_2d = FALSE;
vsi_nn_kernel_node_t node = NULL;
float input0Scale = vsi_nn_get_tensor_scale(inputs[0]);
float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input0Scale;
float input1Scale = vsi_nn_get_tensor_scale(inputs[1]);
float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1Scale;
float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float input0_scale = vsi_nn_get_tensor_scale(inputs[0]);
float input0_tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input0_scale;
float input1_scale = vsi_nn_get_tensor_scale(inputs[1]);
float input1_tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1_scale;
float output_scale = vsi_nn_get_tensor_scale(outputs[0]);
float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } };
vsi_size_t new_rank = 0;
vsi_bool ret = TRUE;
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
VSI_UNREFERENCED(params);
output_scale = vsi_abs(output_scale) < 1e-5 ? 0.0f : 1.0f / output_scale;
outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale;
ret = vsi_nn_kernel_optimize_eltwise_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num,
inputs[1]->attr.size, inputs[1]->attr.dim_num,
outputs[0]->attr.size, outputs[0]->attr.dim_num,
shapes[0], shapes[1], shapes[2], &new_rank );
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
if (ret == FALSE)
{
return NULL;
goto final;
}
image_2d = (outputs[0]->attr.dim_num == 2);
status = _query_kernel( inputs, outputs, image_2d, kernel );
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shapes[0], new_rank );
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
inputs[1], shapes[1], new_rank );
reshape_tensors[2] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[2], new_rank );
if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[2]->attr.size,
reshape_tensors[2]->attr.dim_num ) )
{
goto final;
}
image_2d = (reshape_tensors[2]->attr.dim_num == 2);
status = _query_kernel( reshape_tensors, &reshape_tensors[2], image_2d, kernel );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
@ -276,19 +296,19 @@ static vsi_nn_kernel_node_t _setup
if ( node )
{
vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM,
inputs, 2, outputs, 1 );
reshape_tensors, 2, &reshape_tensors[2], 1 );
node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create(
graph, F32, &input0Scale );
graph, F32, &input0_scale );
node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create(
graph, F32, &input0Tail );
graph, F32, &input0_tail );
node_params[SCALAR_INPUT1_SCALE] = vsi_nn_kernel_scalar_create(
graph, F32, &input1Scale );
graph, F32, &input1_scale );
node_params[SCALAR_INPUT1_TAIL] = vsi_nn_kernel_scalar_create(
graph, F32, &input1Tail );
graph, F32, &input1_tail );
node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create(
graph, F32, &outputScale );
graph, F32, &output_scale );
node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create(
graph, F32, &outputZP );
graph, F32, &output_zp );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM );
@ -301,6 +321,12 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] );
}
}
final:
vsi_safe_release_tensor(reshape_tensors[0]);
vsi_safe_release_tensor(reshape_tensors[1]);
vsi_safe_release_tensor(reshape_tensors[2]);
return node;
} /* _setup() */

View File

@ -22,6 +22,7 @@
*
*****************************************************************************/
#if !(VX_TENSOR_POW_API_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -294,4 +295,4 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CL( pow, _setup )
#endif

View File

@ -0,0 +1,320 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
#define _RESIZE_CUBIC_KERNEL_SOURCE() "resize_cubic"
#define STR(a) #a
// Add kernel hashtable here
#define RESIZE_CUBIC_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
(( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) )
#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_CUBIC_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \
CVIVANTE_NAMESPACE("cl.resize_cubic_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
_RESIZE_CUBIC_KERNEL_SOURCE() }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _resize_cubic_kernel_map[] =
{
PACK_KERNEL_MAP( F32, F32),
PACK_KERNEL_MAP( U8, U8),
};
/*
* Kernel params
*/
static vx_param_description_t _resize_cubic_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define SCALAR_SCALE_X (2)
#define SCALAR_SCALE_Y (3)
#define SCALAR_HALF_PIXEL (4)
#define SCALAR_INPUT_SCALE (5)
#define SCALAR_INPUT_TAIL (6)
#define SCALAR_OUTPUT_SCALE (7)
#define SCALAR_OUTPUT_TAIL (8)
#define RESIZE_CUBIC_NUM 5
#define RESIZE_CUBIC_QUANT_NUM _cnt_of_array( _resize_cubic_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_resize_cubic_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_size_array_t * out_shape = NULL;
VSI_UNREFERENCED(param_size);
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
out_shape = output_attr->shape;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.dim = (out_shape->size < 3 || 1 == out_shape->data[2]) ? 2 : 3;
gpu_param.global_size[0] = gpu_align_p2(
(out_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (
(out_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1]);
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if ( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
SAFE_FREE_TENSOR_ATTR(output_attr);
return status;
} /* _resize_cubic_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
vsi_bool *is_use_u8_kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _resize_cubic_kernel_map;
size_t kernel_map_size = _cnt_of_array( _resize_cubic_kernel_map );
vx_param_description_t * param_def = _resize_cubic_kernel_param_def;
size_t param_def_size = _cnt_of_array( _resize_cubic_kernel_param_def );
vx_kernel_initialize_f initializer = _resize_cubic_initializer;
uint32_t key;
uint32_t i;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (F16 == in_dtype)
{
in_dtype = F32;
}
if (F16 == out_dtype)
{
out_dtype = F32;
}
if ((U8 == in_dtype) || (U8 == out_dtype))
{
param_def_size = RESIZE_CUBIC_QUANT_NUM;
*is_use_u8_kernel = TRUE;
}
else
{
param_def_size = RESIZE_CUBIC_NUM;
*is_use_u8_kernel = FALSE;
}
key = RESIZE_CUBIC_HASH_KEY( in_dtype, out_dtype );
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = (uint32_t)param_def_size;
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[RESIZE_CUBIC_QUANT_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" );
int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" );
vsi_size_t in_width = inputs[0]->attr.size[0];
vsi_size_t in_height = inputs[0]->attr.size[1];
vsi_size_t out_width = outputs[0]->attr.size[0];
vsi_size_t out_height = outputs[0]->attr.size[1];
float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float input_scale = vsi_nn_get_tensor_scale(inputs[0]);
float input_tail = -(input_zp * input_scale);
float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]);
float half_pixel_value = 0.0f;
float scale_factor_x = 0.0f;
float scale_factor_y = 0.0f;
vsi_bool is_use_u8_kernel = FALSE;
if (align_corners && out_width > 1)
{
scale_factor_x = ((vx_float32)(in_width - 1) * 1.0f) / (vx_float32)(out_width - 1);
}
else
{
scale_factor_x = ((vx_float32)in_width * 1.0f) / (vx_float32)out_width;
}
if (align_corners && out_height > 1)
{
scale_factor_y = ((vx_float32)(in_height - 1) * 1.0f) / (vx_float32)(out_height - 1);
}
else
{
scale_factor_y = ((vx_float32)in_height * 1.0f) / (vx_float32)out_height;
}
if (half_pixel_centers)
{
half_pixel_value = 0.5f;
}
else
{
half_pixel_value = 0.0f;
}
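/* Coordinate mapping sketch (assuming the shader follows the usual
 * half-pixel convention; the exact formula lives in the resize_cubic CL
 * source):
 *     in_x = (out_x + half_pixel_value) * scale_factor_x - half_pixel_value
 * so half_pixel_centers shifts the sample point by 0.5 on both sides, while
 * align_corners makes the scale factors above map the first and last pixels
 * of input and output onto each other exactly.
 */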
status = _query_kernel( kernel, inputs, outputs, &is_use_u8_kernel );
if (VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
size_t node_params_num = RESIZE_CUBIC_NUM;
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, RESIZE_CUBIC_QUANT_NUM,
inputs, input_num, outputs, output_num );
node_params[SCALAR_SCALE_X] = vsi_nn_kernel_scalar_create( graph, F32, &scale_factor_x );
node_params[SCALAR_SCALE_Y] = vsi_nn_kernel_scalar_create(graph, F32, &scale_factor_y );
node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, F32, &half_pixel_value );
if (is_use_u8_kernel)
{
node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale );
node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input_tail );
node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale );
node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &output_zp );
node_params_num = RESIZE_CUBIC_QUANT_NUM;
}
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
VSI_ASSERT( status == VSI_SUCCESS );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_X] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_Y] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] );
if (is_use_u8_kernel)
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_TAIL] );
}
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( resize_cubic, _setup )

View File

@ -0,0 +1,727 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
__BEGIN_DECLS
typedef enum
{
NONE = 0,
Add,
Mul,
Max,
Min
} vsi_scatter_nd_update_type_e;
/*
* Define kernel meta.
*/
#define KERNEL_SOURCE_1 "scatter_nd_update_reduction"
#define KERNEL_SOURCE_2 "scatter_nd_update_reduction_conv"
#define HASH_SCATTER_ND_UPDATE_KEY(_input0_type, _input2_type, _output_type, _stage, _op) \
((_input0_type << 24) | (_input2_type << 16) | (_output_type << 8) | (_stage << 4) | (_op))
#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_PREPROCESS_NAME(SRC0_TYPE) \
CVIVANTE_NAMESPACE("cl.scatter_nd_update_reduction_preprocess_"#SRC0_TYPE)
#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_PROCESS_NAME(REDUCTION_TYPE, SRC2_TYPE) \
CVIVANTE_NAMESPACE("cl.scatter_nd_update_reduction_"#REDUCTION_TYPE"_"#SRC2_TYPE)
#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_CONV_NAME(DST_TYPE) \
CVIVANTE_NAMESPACE("cl.scatter_nd_update_reduction_conv_"#DST_TYPE)
#define TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(IN0_TYPE, SOURCE) \
{ HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, 0, 0, 0, 0), \
HASH_SCATTER_ND_UPDATE_SH_KERNEL_PREPROCESS_NAME(IN0_TYPE), \
SOURCE },
#define TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(REDUCTION_TYPE, IN2_TYPE, SOURCE) \
{ HASH_SCATTER_ND_UPDATE_KEY(0, IN2_TYPE, 0, 1, REDUCTION_TYPE), \
HASH_SCATTER_ND_UPDATE_SH_KERNEL_PROCESS_NAME(REDUCTION_TYPE, IN2_TYPE), \
SOURCE },
#define TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(OUT_TYPE, SOURCE) \
{ HASH_SCATTER_ND_UPDATE_KEY(0, 0, OUT_TYPE, 2, 0), \
HASH_SCATTER_ND_UPDATE_SH_KERNEL_CONV_NAME(OUT_TYPE), \
SOURCE },
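/* Key layout: the three stages cannot collide because each wrapper fills a
 * different field of HASH_SCATTER_ND_UPDATE_KEY. Preprocess keys carry only
 * the ref dtype (stage 0), process keys carry the update dtype plus the
 * reduction op (stage 1), and convert keys carry only the output dtype
 * (stage 2); e.g. Add on F32 updates hashes as
 * HASH_SCATTER_ND_UPDATE_KEY(0, F32, 0, 1, Add).
 */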
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type scatter_nd_update_reduction_preprocess_map[] =
{
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(U8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(I8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(I16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(F16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(F32, KERNEL_SOURCE_1)
};
static const _kernel_map_type scatter_nd_update_reduction_process_map[] =
{
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, U8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, U8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, U8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, U8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, I8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, I8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, I8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, I8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, I16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, I16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, I16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, I16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, F16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, F16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, F16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, F16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, F32, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, F32, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, F32, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, F32, KERNEL_SOURCE_1)
};
static const _kernel_map_type scatter_nd_update_reduction_conv_map[] =
{
TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(U8, KERNEL_SOURCE_2)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(I8, KERNEL_SOURCE_2)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(I16, KERNEL_SOURCE_2)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(F16, KERNEL_SOURCE_2)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(F32, KERNEL_SOURCE_2)
};
/*
* Kernel params
*/
static vx_param_description_t _scatter_nd_update_preprocess_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
static vx_param_description_t _scatter_nd_update_process_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
static vx_param_description_t _scatter_nd_update_conv_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM _cnt_of_array(_scatter_nd_update_preprocess_kernel_param_def)
#define _SCATTER_ND_UPDATE_PROCESS_PARAM_NUM _cnt_of_array(_scatter_nd_update_process_kernel_param_def)
#define _SCATTER_ND_UPDATE_CONV_PARAM_NUM _cnt_of_array(_scatter_nd_update_conv_kernel_param_def)
static vsi_status cal_scatter_nd_update_tensor_reshape_size
(
vsi_nn_tensor_t ** inputs,
vsi_size_t sizes[VSI_NN_MAX_DIM_NUM],
uint32_t block_size,
uint32_t coordDim,
vsi_size_t strides[VSI_NN_MAX_DIM_NUM],
int32_t* newDim
)
{
vsi_status status = VSI_SUCCESS;
uint32_t dims_num = inputs[0]->attr.dim_num;
vsi_size_t *input_size = inputs[0]->attr.size;
uint32_t i = 0;
vsi_size_t elementCnt = 1;
#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH
newDim[0] = 0;
for (i = 0; i < dims_num; ++i)
{
elementCnt *= input_size[i];
}
for (i = 0; i < VSI_NN_MAX_DIM_NUM; ++i)
{
sizes[i] = 1;
}
sizes[0] = block_size;
sizes[1] = elementCnt / block_size;
newDim[0] = 2;
if (coordDim == 1 && strides) // index shape
{
for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++)
{
strides[i] = 0;
}
}
else if (coordDim >= 2 && coordDim <= VSI_NN_MAX_DIM_NUM && strides)
{
for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++)
{
strides[i] = 0;
}
strides[0] = input_size[dims_num - coordDim];
for (i = 1; i < coordDim - 1; i++)
{
strides[i] = strides[i - 1] * input_size[dims_num - coordDim + i];
}
}
#undef VSI_NN_MAX_IMAGE_WIDTH
return status;
} /* cal_scatter_nd_update_tensor_reshape_size */
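/* Illustration of the stride computation above: for a ref tensor with
 * attr.size = {W, H, C, N} (dims_num == 4) and coordDim == 3, the index
 * components address the outer three dimensions, so
 *     strides[0] = input_size[1] = H
 *     strides[1] = strides[0] * input_size[2] = H * C
 * and the remaining entries stay 0; coordDim == 1 leaves every stride at 0
 * because a single index component needs no multiplier.
 */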
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_scatter_nd_update_reduction_preprocess_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
1,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
int32_t width = 0;
int32_t element_size = 1;
int32_t i = 0;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
for (i = 0; i < (int32_t)attr[0]->shape->size; i++)
{
element_size *= (int32_t)attr[0]->shape->data[i];
}
width = element_size / 8;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
if (element_size < 8)
{
gpu_param.global_size[0] = element_size;
}
else
{
gpu_param.global_size[0] = width;
}
gpu_param.global_size[1] = 1;
gpu_param.global_size[2] = 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, final);
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
return status;
} /* _scatter_nd_update_reduction_preprocess_initializer() */
DEF_KERNEL_INITIALIZER(_scatter_nd_update_process_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
2,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
int32_t block_size = 1;
int32_t index_num = 1;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
block_size = (int32_t)(attr[1]->shape->data[0]);
index_num = (int32_t)(attr[0]->shape->data[1]);
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = block_size;
gpu_param.global_size[1] = index_num;
gpu_param.global_size[2] = 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, OnError);
OnError:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;
} /* _scatter_nd_update_process_initializer() */
DEF_KERNEL_INITIALIZER(_scatter_nd_update_conv_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
1,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
int32_t width = 0;
int32_t element_size = 1;
int32_t i = 0;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
for (i = 0; i < (int32_t)attr[0]->shape->size; i++)
{
element_size *= (int32_t)attr[0]->shape->data[i];
}
width = element_size / 8;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
if (element_size < 8)
{
gpu_param.global_size[0] = element_size;
}
else
{
gpu_param.global_size[0] = width;
}
gpu_param.global_size[1] = 1;
gpu_param.global_size[2] = 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, OnError);
OnError:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
return status;
} /* _scatter_nd_update_conv_initializer() */
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel_preprocess,
vsi_nn_kernel_t* kernel_process,
vsi_nn_kernel_t* kernel_conv,
int32_t reduction_flg
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e input2_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
size_t i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, 0, 0, 0, 0 );
for ( i = 0; i < _cnt_of_array(scatter_nd_update_reduction_preprocess_map); i ++ )
{
if ( scatter_nd_update_reduction_preprocess_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(scatter_nd_update_reduction_preprocess_map) )
{
snprintf( kernel_preprocess->info.name, VX_MAX_KERNEL_NAME, "%s",
scatter_nd_update_reduction_preprocess_map[i].function_name );
kernel_preprocess->info.parameters = _scatter_nd_update_preprocess_kernel_param_def;
kernel_preprocess->info.numParams = _cnt_of_array( _scatter_nd_update_preprocess_kernel_param_def );
kernel_preprocess->info.initialize = _scatter_nd_update_reduction_preprocess_initializer;
// Register code source
vsi_nn_kernel_add_source( kernel_preprocess, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
scatter_nd_update_reduction_preprocess_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel_preprocess, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
scatter_nd_update_reduction_preprocess_map[i].source_name );
status = VSI_SUCCESS;
}
key = HASH_SCATTER_ND_UPDATE_KEY( 0, input2_dtype, 0, 1, reduction_flg );
for ( i = 0; i < _cnt_of_array(scatter_nd_update_reduction_process_map); i ++ )
{
if ( scatter_nd_update_reduction_process_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(scatter_nd_update_reduction_process_map) )
{
snprintf( kernel_process->info.name, VX_MAX_KERNEL_NAME, "%s",
scatter_nd_update_reduction_process_map[i].function_name );
kernel_process->info.parameters = _scatter_nd_update_process_kernel_param_def;
kernel_process->info.numParams = _cnt_of_array( _scatter_nd_update_process_kernel_param_def );
kernel_process->info.initialize = _scatter_nd_update_process_initializer;
// Register code source
vsi_nn_kernel_add_source( kernel_process, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
scatter_nd_update_reduction_process_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel_process, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
scatter_nd_update_reduction_process_map[i].source_name );
status = VSI_SUCCESS;
}
key = HASH_SCATTER_ND_UPDATE_KEY( 0, 0, output_dtype, 2, 0 );
for ( i = 0; i < _cnt_of_array(scatter_nd_update_reduction_conv_map); i ++ )
{
if ( scatter_nd_update_reduction_conv_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(scatter_nd_update_reduction_conv_map) )
{
snprintf( kernel_conv->info.name, VX_MAX_KERNEL_NAME, "%s",
scatter_nd_update_reduction_conv_map[i].function_name );
kernel_conv->info.parameters = _scatter_nd_update_conv_kernel_param_def;
kernel_conv->info.numParams = _cnt_of_array( _scatter_nd_update_conv_kernel_param_def );
kernel_conv->info.initialize = _scatter_nd_update_conv_initializer;
// Register code source
vsi_nn_kernel_add_source( kernel_conv, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
scatter_nd_update_reduction_conv_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel_conv, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
scatter_nd_update_reduction_conv_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_t node = NULL;
vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}};
vsi_size_t strides[VSI_NN_MAX_DIM_NUM] = {0};
int32_t coord_strides[VSI_NN_MAX_DIM_NUM] = {0};
int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" );
int32_t reduction = vsi_nn_kernel_param_get_int32( params, "reduction" );
int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0;
float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float input_scale = vsi_nn_get_tensor_scale(inputs[0]);
float input_zp_scale = 0 - input_zp * input_scale;
float update_zp = (float)vsi_nn_get_tensor_zero_point(inputs[2]);
float update_scale = vsi_nn_get_tensor_scale(inputs[2]);
float update_zp_scale = 0 - update_zp * update_scale;
float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]);
vsi_nn_tensor_t * tensors[2] = { NULL };
vsi_nn_kernel_t * ikernels[2] = { NULL };
int32_t i = 0;
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = cal_scatter_nd_update_tensor_reshape_size(&inputs[1], shapes[0], coord_dim, 0,
NULL, &rs_idx_dim);
status |= cal_scatter_nd_update_tensor_reshape_size(&inputs[2], shapes[1], block_size, 0,
NULL, &rs_in_dim);
status |= cal_scatter_nd_update_tensor_reshape_size(&outputs[0], shapes[2], block_size, coord_dim,
strides, &rs_out_dim);
CHECK_STATUS_FAIL_GOTO( status, final );
coord_strides[coord_dim - 1] = 1;
for (i = 0; i < coord_dim - 1; i++)
{
coord_strides[i] = (int32_t)strides[coord_dim - 2 - i];
}
{
vsi_nn_tensor_attr_t attr;
vsi_nn_kernel_node_t preprocess_node = NULL;
vsi_nn_kernel_node_t process_node = NULL;
vsi_nn_kernel_node_param_t preprocess_params[_SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_param_t process_params[_SCATTER_ND_UPDATE_PROCESS_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_param_t conv_params[_SCATTER_ND_UPDATE_CONV_PARAM_NUM] = { NULL };
int32_t width = 1;
int32_t res = 0;
int32_t update_width = (int32_t)shapes[1][0];
int32_t output_width = (int32_t)shapes[2][0];
ikernels[0] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_CL );
ikernels[0]->unique_id = kernel->unique_id;
ikernels[1] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_CL );
ikernels[1]->unique_id = kernel->unique_id;
memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) );
attr.dtype = outputs[0]->attr.dtype;
attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
attr.is_const = FALSE;
attr.vtl = TRUE;
for (i = 0; i < rs_out_dim; i++)
{
attr.size[i] = shapes[2][i];
width *= (int32_t)shapes[2][i];
}
attr.dim_num = rs_out_dim;
res = width % 8;
width = (width >> 3) << 3;
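/* width is floored to a multiple of 8 and res keeps the 0..7 leftover
 * elements; this pairs with the preprocess/conv initializers above, which
 * launch element_size / 8 work-items, so each work-item presumably converts
 * 8 values and the shader uses res to finish the tail.
 */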
tensors[0] = vsi_nn_CreateTensor( graph, &attr ); // ref'
attr.size[0] = 1;
attr.size[1] = 1;
attr.dim_num = rs_out_dim;
tensors[1] = vsi_nn_CreateTensor( graph, &attr ); // link_buffer0
status = _query_kernel( inputs, outputs, ikernels[0], ikernels[1], kernel, reduction);
if ( VSI_SUCCESS == status)
{
// convert ref to float
preprocess_node = vsi_nn_kernel_create_node( graph, ikernels[0] );
if (preprocess_node)
{
uint32_t index = 0;
/* Pass parameters to node. */
preprocess_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[2], rs_out_dim );
preprocess_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t;
preprocess_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
preprocess_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &res );
preprocess_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale );
preprocess_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_zp_scale );
status = vsi_nn_kernel_node_pass_param( preprocess_node, preprocess_params,
_SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_tensor_release( &preprocess_params[0] );
vsi_nn_kernel_scalar_release( &preprocess_params[2] );
vsi_nn_kernel_scalar_release( &preprocess_params[3] );
vsi_nn_kernel_scalar_release( &preprocess_params[4] );
vsi_nn_kernel_scalar_release( &preprocess_params[5] );
}
// update
process_node = vsi_nn_kernel_create_node( graph, ikernels[1] );
if (process_node)
{
uint32_t index = 0;
/* Pass parameters to node. */
process_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[0], rs_idx_dim );
process_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shapes[1], rs_in_dim );
process_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t;
process_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t;
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[0] );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[1] );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[2] );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[3] );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[4] );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[5] );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[6] );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &update_width );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &output_width );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &update_scale );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &update_zp_scale );
status = vsi_nn_kernel_node_pass_param( process_node, process_params,
_SCATTER_ND_UPDATE_PROCESS_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_tensor_release( &process_params[0] );
vsi_nn_kernel_tensor_release( &process_params[1] );
vsi_nn_kernel_scalar_release( &process_params[4] );
vsi_nn_kernel_scalar_release( &process_params[5] );
vsi_nn_kernel_scalar_release( &process_params[6] );
vsi_nn_kernel_scalar_release( &process_params[7] );
vsi_nn_kernel_scalar_release( &process_params[8] );
vsi_nn_kernel_scalar_release( &process_params[9] );
vsi_nn_kernel_scalar_release( &process_params[10] );
vsi_nn_kernel_scalar_release( &process_params[11] );
vsi_nn_kernel_scalar_release( &process_params[12] );
vsi_nn_kernel_scalar_release( &process_params[13] );
vsi_nn_kernel_scalar_release( &process_params[14] );
vsi_nn_kernel_scalar_release( &process_params[15] );
}
// convert float to output
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
uint32_t index = 0;
/* Pass parameters to node. */
conv_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t;
conv_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t;
conv_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], rs_out_dim );
conv_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
conv_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &res );
conv_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale );
conv_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp );
status = vsi_nn_kernel_node_pass_param( node, conv_params, _SCATTER_ND_UPDATE_CONV_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_tensor_release( &conv_params[2] );
vsi_nn_kernel_scalar_release( &conv_params[3] );
vsi_nn_kernel_scalar_release( &conv_params[4] );
vsi_nn_kernel_scalar_release( &conv_params[5] );
vsi_nn_kernel_scalar_release( &conv_params[6] );
}
}
if (preprocess_node) {vsi_nn_kernel_node_release( &preprocess_node );}
if (process_node) {vsi_nn_kernel_node_release( &process_node );}
}
final:
if (ikernels[0])
{
vsi_nn_kernel_release(&ikernels[0]);
}
if (ikernels[1])
{
vsi_nn_kernel_release(&ikernels[1]);
}
vsi_safe_release_tensor(tensors[0]);
vsi_safe_release_tensor(tensors[1]);
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( scatter_nd_update_reduction, _setup )

View File

@ -22,6 +22,7 @@
*
*****************************************************************************/
#if !(VX_TENSOR_SELECT_VX_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -359,3 +360,4 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CL( select, _setup )
#endif

View File

@ -22,6 +22,7 @@
*
*****************************************************************************/
#if !(VX_TENSOR_TILE_API_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -445,3 +446,4 @@ final:
__END_DECLS
REGISTER_BACKEND_CL( tile, _setup )
#endif

View File

@ -438,7 +438,7 @@ static vsi_nn_kernel_node_t _setup
vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }};
int32_t width = (int32_t)block_size;
int32_t top_k = vsi_nn_kernel_param_get_int32(params, "top_k");
int32_t num_stages = (int32_t)ceil(log10(block_size / 2.0f) / log10(2.0f));
int32_t num_stages = (int32_t)vsi_nn_max(ceil(log10(block_size / 2.0f) / log10(2.0f)), 0);
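/* num_stages is the stage count of the sorting network,
 * ceil(log2(block_size / 2)): e.g. block_size = 32 gives ceil(log2(16)) = 4.
 * The vsi_nn_max(..., 0) clamp added here keeps tiny block sizes legal
 * (block_size = 1 would otherwise yield -1).
 */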
vsi_bool is_odd_even_sort = FALSE;
size_t param_num = _TOPK_PARAM_NUM;
float inputScale = vsi_nn_get_tensor_scale(inputs[0]);

View File

@ -106,14 +106,12 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer)
vsi_nn_kernel_dtype_e output_dtype = F16;
vsi_nn_kernel_tensor_attr_t *input0_attr = NULL, *input1_attr = NULL, *output_attr = NULL;
vsi_size_array_t *input_shape = NULL;
float scaleIn = 1.0f;
int32_t input_ZP = 0;
float scaleIn1 = 1.0f;
int32_t input_ZP1 = 0;
float scaleOut = 1.0f;
int32_t output_ZP = 0;
int32_t fixpoint = 0, fixpoint1 = 0, fixpoint_out = 0;
float inScale_dfp, inScale_dfp1;
float scaleIn = 1.0f;
int32_t input_ZP = 0;
float scaleIn1 = 1.0f;
int32_t input_ZP1 = 0;
float scaleOut = 1.0f;
int32_t output_ZP = 0;
float eps = 0.0f;
float rsEps = 0.0f;
float dimRatio = 0.0f;
@ -135,80 +133,12 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer)
rsEps = (float)(1.0f / sqrtf(eps));
dimRatio = (float)(1.0 / (input_shape->data[0]));
if ( VSI_NN_KERNEL_QUANT_DFP == input0_attr->quant )
{
fixpoint = input0_attr->dfp.fl;
}
else if ( VSI_NN_KERNEL_QUANT_ASYMM == input0_attr->quant )
{
input_ZP = input0_attr->asymm.zero_point;
scaleIn = input0_attr->asymm.scale;
}
else
{
input_ZP = 0;
scaleIn = 1.0f;
}
//input1
if ( VSI_NN_KERNEL_QUANT_DFP == input1_attr->quant )
{
fixpoint1 = input1_attr->dfp.fl;
}
else if ( VSI_NN_KERNEL_QUANT_ASYMM == input1_attr->quant )
{
input_ZP1 = input1_attr->asymm.zero_point;
scaleIn1 = input1_attr->asymm.scale;
}
else
{
input_ZP1 = 0;
scaleIn1 = 1.0f;
}
//output
if ( VSI_NN_KERNEL_QUANT_DFP == output_attr->quant )
{
fixpoint_out = output_attr->dfp.fl;
if (fixpoint_out >= 0)
{
scaleOut = 1.0f / (vx_float32) ((int64_t)1 << fixpoint_out);
}
else
{
scaleOut = (vx_float32) ((int64_t)1 << -fixpoint_out);
}
output_ZP = 0;
}
else if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant )
{
output_ZP = output_attr->asymm.zero_point;
scaleOut = output_attr->asymm.scale;
}
else
{
output_ZP = 0;
scaleOut = 1.0f;
}
if (fixpoint >= 0)
{
inScale_dfp = 1.0f / (vx_float32) ((int64_t)1 << fixpoint);
}
else
{
inScale_dfp = (vx_float32) ((int64_t)1 << -fixpoint);
}
if (fixpoint1 >= 0)
{
inScale_dfp1 = 1.0f / (vx_float32) ((int64_t)1 << fixpoint1);
}
else
{
inScale_dfp1 = (vx_float32) ((int64_t)1 << -fixpoint1);
}
scaleIn = input0_attr->scale;
input_ZP = input0_attr->zero_point;
scaleIn1 = input1_attr->scale;
input_ZP1 = input1_attr->zero_point;
scaleOut = output_attr->scale;
output_ZP = output_attr->zero_point;
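/* The unified attr->scale / attr->zero_point fields already encode what the
 * deleted branches computed by hand: for a DFP tensor with fraction length
 * fl they should hold scale = 2^-fl (fl >= 0) or 2^|fl| (fl < 0) with a zero
 * point of 0, so reading scale/zero_point directly covers the DFP, ASYMM and
 * non-quantized cases alike (an assumption about the attr helpers, but
 * consistent with the code removed above).
 */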
gpu_param.global_offset[0] = 0;
gpu_param.global_offset[1] = 0;
@ -349,8 +279,8 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer)
&uniConvertInt16ScaleToFp32Fst_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16ScaleToFp32Sec_4x4",
&uniConvertInt16ScaleToFp32Sec_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "inScale_i16", &inScale_dfp);
status |= vsi_nn_kernel_gpu_add_param(node, "inScale1_i16", &inScale_dfp1);
status |= vsi_nn_kernel_gpu_add_param(node, "inScale_i16", &scaleIn);
status |= vsi_nn_kernel_gpu_add_param(node, "inScale1_i16", &scaleIn1);
CHECK_STATUS_FAIL_GOTO(status, final );
}
width = (int32_t)input_shape->data[0];

View File

@ -215,41 +215,11 @@ DEF_KERNEL_INITIALIZER(_batch_norm_initializer)
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output);
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
if( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = input_attr->dfp.fl;
if (fl > 0)
{
input_scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
input_scale = (float)((int64_t)1 << -fl);
}
}
else if( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
input_scale = input_attr->asymm.scale;
input_tail = 0 - input_scale * (float)input_attr->asymm.zero_point;
}
input_scale = input_attr->scale;
input_tail = 0 - input_scale * (float)input_attr->zero_point;
if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = output_attr->dfp.fl;
if (fl > 0)
{
output_scale = (float) ((int64_t)1 << fl);
}
else
{
output_scale = 1.0f / (float)((int64_t)1 << -fl);
}
}
else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
output_scale = 1.0f / output_attr->asymm.scale;
output_zp = (float)output_attr->asymm.zero_point;
}
output_scale = 1.0f / output_attr->scale;
output_zp = (float)output_attr->zero_point;
pack_key = _PACK_BATCH_NORM_KEY( input_attr->dtype, output_attr->dtype );

View File

@ -121,23 +121,20 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
vsi_nn_kernel_dtype_e output_dtype = F16;
uint32_t depth = 0;
float half_input0_wh[2];
float add_float_value[2];
uint32_t in0_width;
uint32_t in0_height;
uint32_t out_width;
uint32_t out_height;
int32_t align_corners;
float half_input0_wh[2] = {0};
float add_float_value[2] = {0};
uint32_t in0_width = 0;
uint32_t in0_height = 0;
uint32_t out_width = 0;
uint32_t out_height = 0;
int32_t align_corners = 0;
int32_t src0FixPointPos = 0;
int32_t src1FixPointPos = 0;
int32_t dstFixPointPos = 0;
float input0_scale = 1.0;
int32_t input0ZP = 0;
float input1_scale = 1.0;
int32_t input1ZP = 0;
float output_scale = 1.0;
int32_t outputZP = 0;
float input0_scale = 1.0;
int32_t input0ZP = 0;
float input1_scale = 1.0;
int32_t input1ZP = 0;
float output_scale = 1.0;
int32_t outputZP = 0;
VSI_UNREFERENCED(param_size);
@ -165,54 +162,14 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
input1_dtype = input_attr[1]->dtype;
output_dtype = output_attr->dtype;
if (U8 == input0_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr[0]->quant) {
input0_scale = input_attr[0]->asymm.scale;
input0ZP = input_attr[0]->asymm.zero_point;
} else if (VSI_NN_KERNEL_QUANT_DFP == input_attr[0]->quant) {
src0FixPointPos = input_attr[0]->dfp.fl;
if (src0FixPointPos >= 0) {
input0_scale = 1.0f / (float)((int64_t)1 << src0FixPointPos);
} else if (src0FixPointPos < 0) {
input0_scale = (float)((int64_t)1 << -src0FixPointPos);
}
input0ZP = 0;
} else {
input0_scale = 1.0f;
input0ZP = 0;
}
input0_scale = input_attr[0]->scale;
input0ZP = input_attr[0]->zero_point;
if (U8 == input1_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr[1]->quant) {
input1_scale = input_attr[1]->asymm.scale;
input1ZP = input_attr[1]->asymm.zero_point;
} else if (VSI_NN_KERNEL_QUANT_DFP == input_attr[1]->quant) {
src1FixPointPos = input_attr[1]->dfp.fl;
if (src1FixPointPos >= 0) {
input1_scale = 1.0f / (float)((int64_t)1 << src1FixPointPos);
} else if (src1FixPointPos < 0) {
input1_scale = (float)((int64_t)1 << -src1FixPointPos);
}
input1ZP = 0;
} else {
input1_scale = 1.0f;
input1ZP = 0;
}
if (U8 == output_dtype && VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant) {
output_scale = output_attr->asymm.scale;
outputZP = output_attr->asymm.zero_point;
} else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) {
dstFixPointPos = output_attr->dfp.fl;
if (dstFixPointPos >= 0) {
output_scale = (float)((int64_t)1 << dstFixPointPos);
} else if (dstFixPointPos < 0) {
output_scale = 1.0f / (float)((int64_t)1 << -dstFixPointPos);
}
outputZP = 0;
} else {
output_scale = 1.0;
outputZP = 0;
}
input1_scale = input_attr[1]->scale;
input1ZP = input_attr[1]->zero_point;
output_scale = output_attr->scale;
outputZP = output_attr->zero_point;
in0_width = (uint32_t)(in0_shape->data[0]);
in0_height = (uint32_t)(in0_shape->data[1]);
@ -496,7 +453,7 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
I16 == output_dtype)) ||
((I8 == input0_dtype && I8 == input1_dtype &&
I8 == output_dtype))) {
float dfpScale = input0_scale * output_scale;
float dfpScale = input0_scale / output_scale;
gpu_dp_inst_t uniDFPtoFp32_part0_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
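Editor's note: the switch from multiplication to division in dfpScale tracks the representation change above. The removed DFP path stored the reciprocal of the tensor scale in output_scale (1 << fl for a non-negative fractional length), so combining input and output was a multiply; with output_scale now holding the tensor scale itself, the same ratio is input0_scale / output_scale. A tiny sanity check under that assumption:

    #include <assert.h>
    #include <math.h>

    int main(void)
    {
        const int   fl            = 5;                        /* example fractional length */
        const float in_scale      = 0.25f;                    /* example input scale       */
        const float out_scale     = 1.0f / (float)(1 << fl);  /* tensor scale              */
        const float out_scale_rcp = (float)(1 << fl);         /* old stored (reciprocal)   */

        const float old_form = in_scale * out_scale_rcp;
        const float new_form = in_scale / out_scale;
        assert(fabsf(old_form - new_form) < 1e-6f);
        return 0;
    }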

View File

@ -179,7 +179,6 @@ DEF_KERNEL_INITIALIZER(_clip_initializer)
/ gpu_param.global_scale[1]);
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
srcFixPointPos = input_attr->dfp.fl;

View File

@ -22,6 +22,7 @@
*
*****************************************************************************/
#if !(VX_RELATIONAL_OPS_VX_SUPPORT_EXT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -319,41 +320,10 @@ DEF_KERNEL_INITIALIZER(_comparisons_initializer)
out_shape = attr[2]->shape;
if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[0]->dfp.fl;
if (fl > 0)
{
input0Scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
input0Scale = (float)((int64_t)1 << -fl);
}
}
else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
input0Scale = attr[0]->asymm.scale;
input0Tail = 0 - attr[0]->asymm.zero_point * input0Scale;
}
if( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[1]->dfp.fl;
if (fl > 0)
{
input1Scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
input1Scale = (float)((int64_t)1 << -fl);
}
}
else if( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
input1Scale = attr[1]->asymm.scale;
input1Tail = 0 - attr[1]->asymm.zero_point * input1Scale;
}
input0Scale = attr[0]->scale;
input0Tail = 0 - attr[0]->zero_point * input0Scale;
input1Scale = attr[1]->scale;
input1Tail = 0 - attr[1]->zero_point * input1Scale;
gpu_param.global_scale[0] = 8;
gpu_param.global_scale[1] = 1;
@ -616,3 +586,4 @@ final:
REGISTER_BACKEND_EVIS( relational_ops, _setup )
__END_DECLS
#endif

View File

@ -152,23 +152,12 @@ DEF_KERNEL_INITIALIZER(_conv1d_ovxlib_initializer)
out_shape = output_attr->shape;
weight_shape = weights_attr->shape;
if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant )
{
input_ZP = input_attr->asymm.zero_point;
scaleIn = input_attr->asymm.scale;
}
if ( VSI_NN_KERNEL_QUANT_ASYMM == weights_attr->quant )
{
weight_ZP = weights_attr->asymm.zero_point;
scaleWights = weights_attr->asymm.scale;
}
if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant )
{
output_ZP = (float)output_attr->asymm.zero_point;
scaleOut = output_attr->asymm.scale;
}
input_ZP = input_attr->zero_point;
scaleIn = input_attr->scale;
weight_ZP = weights_attr->zero_point;
scaleWights = weights_attr->scale;
output_ZP = (float)output_attr->zero_point;
scaleOut = output_attr->scale;
scaleOut = (scaleIn * scaleWights) / scaleOut;
input_height = (int32_t)(in_shape->data[1]);

View File

@ -0,0 +1,540 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
#include "utils/vsi_nn_dtype_util.h"
__BEGIN_DECLS
typedef enum _crop_and_resize_type_e
{
nearest_neighbor = VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR,
bilinear = VSI_NN_INTERPOLATION_BILINEAR,
}crop_and_resize_type_e;
#define _CROP_AND_RESIZE_KERNEL_SOURCE_NAME "crop_and_resize_"
// Add kernel hashtable here
#define CROP_AND_RESIZE_HASH_KEY( IN_DTYPE, OUT_DTYPE, RESIZE_METHOD ) \
(( IN_DTYPE << 16 ) | ( OUT_DTYPE << 8) | (RESIZE_METHOD))
#define CROP_AND_RESIZE_KERNEL( IN_DTYPE, OUT_DTYPE, RESIZE_METHOD ) \
{ CROP_AND_RESIZE_HASH_KEY( IN_DTYPE, OUT_DTYPE, RESIZE_METHOD ), \
CVIVANTE_NAMESPACE("evis.crop_and_resize_"#RESIZE_METHOD"_"#IN_DTYPE"to"#OUT_DTYPE), \
_CROP_AND_RESIZE_KERNEL_SOURCE_NAME#RESIZE_METHOD }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _crop_and_resize_kernel_map[] =
{
// Register kernel here
CROP_AND_RESIZE_KERNEL( U8, U8, nearest_neighbor ),
CROP_AND_RESIZE_KERNEL( U8, F16, nearest_neighbor ),
CROP_AND_RESIZE_KERNEL( F16, F16, nearest_neighbor),
CROP_AND_RESIZE_KERNEL( F16, U8, nearest_neighbor ),
CROP_AND_RESIZE_KERNEL( F16, I8, nearest_neighbor),
CROP_AND_RESIZE_KERNEL( I8, I8, nearest_neighbor ),
CROP_AND_RESIZE_KERNEL( I8, F16, nearest_neighbor),
CROP_AND_RESIZE_KERNEL( I16, I16, nearest_neighbor ),
CROP_AND_RESIZE_KERNEL( I16, F16, nearest_neighbor),
CROP_AND_RESIZE_KERNEL( U8, U8, bilinear),
CROP_AND_RESIZE_KERNEL( U8, F16, bilinear),
CROP_AND_RESIZE_KERNEL( F16, F16, bilinear),
CROP_AND_RESIZE_KERNEL( F16, U8, bilinear),
CROP_AND_RESIZE_KERNEL( F16, I8, bilinear),
CROP_AND_RESIZE_KERNEL( I8, I8, bilinear),
CROP_AND_RESIZE_KERNEL( I8, F16, bilinear),
CROP_AND_RESIZE_KERNEL( I16, I16, bilinear),
CROP_AND_RESIZE_KERNEL( I16, F16, bilinear),
};
/*
* Kernel params
*/
static vx_param_description_t _crop_and_resize_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _CROP_AND_RESIZE_PARAM_NUM _cnt_of_array( _crop_and_resize_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_crop_and_resize_nearest_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
};
vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
int32_t crop_width = 0;
int32_t crop_height = 0;
int32_t image_width = 0;
int32_t image_height = 0;
int32_t batch_out = 0;
float width_scale = 0;
float height_scale = 0;
float src0ZP = 0;
float src0Scale = 1;
float dstZP = 0;
float dstScale = 1;
float inOutScale = 0;
float inOutTile = 0;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &batch_out);
CHECK_STATUS_FAIL_GOTO(status, final );
src0Scale = attr[0]->scale;
src0ZP = (float)attr[0]->zero_point;
dstScale = attr[1]->scale;
dstZP = (float)attr[1]->zero_point;
inOutScale = src0Scale / dstScale;
inOutTile = dstZP - inOutScale * src0ZP;
image_width = (int32_t)(attr[0]->shape->data[0]);
image_height = (int32_t)(attr[0]->shape->data[1]);
crop_width = (int32_t)(attr[1]->shape->data[0]);
crop_height = (int32_t)(attr[1]->shape->data[1]);
width_scale = (crop_width > 1) ? (float)(image_width - 1) / (crop_width -1) : 0;
height_scale = (crop_height > 1) ? (float)(image_height - 1) / (crop_height -1) : 0;
gpu_param.global_scale[0] = 8;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = gpu_align_p2((crop_width + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 8);
gpu_param.global_size[1] = (crop_height + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1];
gpu_param.global_size[2] = (batch_out + gpu_param.global_scale[2] - 1)
/ gpu_param.global_scale[2];
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, final);
{
gpu_dp_inst_t uniExtract8Bit_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtractHalf8_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00002100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertFstToFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00000000, 0x00003c00, 0x00000000,
0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniConvertSecToFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00050004, 0x00070006, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00000000, 0x00003c00, 0x00000000,
0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniConvertFstToFp32_4x4", &uniConvertFstToFp32_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertSecToFp32_4x4", &uniConvertSecToFp32_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node, "uniExtract8Bit_2x8", &uniExtract8Bit_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node, "uniExtractHalf8_2x8", &uniExtractHalf8_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node, "inOutScale", &inOutScale );
status |= vsi_nn_kernel_gpu_add_param( node, "inOutTile", &inOutTile );
status |= vsi_nn_kernel_gpu_add_param( node, "width_scale", &width_scale );
status |= vsi_nn_kernel_gpu_add_param( node, "height_scale", &height_scale );
status |= vsi_nn_kernel_gpu_add_param( node, "image_width", &image_width );
status |= vsi_nn_kernel_gpu_add_param( node, "image_height", &image_height );
CHECK_STATUS_FAIL_GOTO(status, final);
}
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;
} /* _crop_and_resize_nearest_initializer() */
DEF_KERNEL_INITIALIZER(_crop_and_resize_bilinear_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
};
vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
int32_t crop_width = 0;
int32_t crop_height = 0;
int32_t image_width = 0;
int32_t image_height = 0;
int32_t batch_out = 0;
float width_scale = 0;
float height_scale = 0;
float src0ZP = 0;
float src0Scale = 1;
float dstZP = 0;
float dstScale = 1;
float inOutScale = 0;
float inOutTile = 0;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &batch_out);
CHECK_STATUS_FAIL_GOTO(status, final );
src0Scale = attr[0]->scale;
src0ZP = (float)attr[0]->zero_point;
dstScale = attr[1]->scale;
dstZP = (float)attr[1]->zero_point;
inOutScale = src0Scale / dstScale;
inOutTile = dstZP - inOutScale * src0ZP;
image_width = (int32_t)(attr[0]->shape->data[0]);
image_height = (int32_t)(attr[0]->shape->data[1]);
crop_width = (int32_t)(attr[1]->shape->data[0]);
crop_height = (int32_t)(attr[1]->shape->data[1]);
width_scale = (crop_width > 1) ? (float)(image_width - 1) / (crop_width -1) : 0;
height_scale = (crop_height > 1) ? (float)(image_height - 1) / (crop_height -1) : 0;
gpu_param.global_scale[0] = 4;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = gpu_align_p2((crop_width + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (crop_height + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1];
gpu_param.global_size[2] = (batch_out + gpu_param.global_scale[2] - 1)
/ gpu_param.global_scale[2];
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, final);
{
gpu_dp_inst_t uniExtract8Bit_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtractHalf8_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00002100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniRightToFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00030001, 0x00070005, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniLeftToFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00020000, 0x00060004, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001,
0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
status = vsi_nn_kernel_gpu_add_param( node, "uniRightToFp32_4x4", &uniRightToFp32_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node, "uniLeftToFp32_4x4", &uniLeftToFp32_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node, "uniExtract8Bit_2x8", &uniExtract8Bit_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node, "uniExtractHalf8_2x8", &uniExtractHalf8_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node, "inOutScale", &inOutScale );
status |= vsi_nn_kernel_gpu_add_param( node, "inOutTile", &inOutTile );
status |= vsi_nn_kernel_gpu_add_param( node, "width_scale", &width_scale );
status |= vsi_nn_kernel_gpu_add_param( node, "height_scale", &height_scale );
status |= vsi_nn_kernel_gpu_add_param( node, "image_width", &image_width );
status |= vsi_nn_kernel_gpu_add_param( node, "image_height", &image_height );
CHECK_STATUS_FAIL_GOTO(status, final);
}
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;
} /* _crop_and_resize_bilinear_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t resize_method
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _crop_and_resize_kernel_map;
size_t kernel_map_size = _cnt_of_array( _crop_and_resize_kernel_map );
vx_param_description_t * param_def = _crop_and_resize_kernel_param_def;
vx_kernel_initialize_f initializer = _crop_and_resize_nearest_initializer;
uint32_t key;
uint32_t i;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (resize_method == bilinear)
{
initializer = _crop_and_resize_bilinear_initializer;
}
key = CROP_AND_RESIZE_HASH_KEY( in_dtype, out_dtype, resize_method );
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = _cnt_of_array( _crop_and_resize_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_CROP_AND_RESIZE_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL;
vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{0}};
uint32_t ori_depth = (uint32_t)inputs[0]->attr.size[2];
uint32_t ori_batchout = (uint32_t)outputs[0]->attr.size[3];
float extrapolation_value = vsi_nn_kernel_param_get_float32( params, "extrapolation_value" );
int32_t resize_method = vsi_nn_kernel_param_get_int32( params, "resize_method" );
VSI_UNREFERENCED(params);
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
shapes[0][0] = inputs[0]->attr.size[0];
shapes[0][1] = inputs[0]->attr.size[1];
shapes[0][2] = inputs[0]->attr.size[2] * inputs[0]->attr.size[3];
shapes[1][0] = outputs[0]->attr.size[0];
shapes[1][1] = outputs[0]->attr.size[1];
shapes[1][2] = outputs[0]->attr.size[2] * outputs[0]->attr.size[3];
rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[0], 3 );
rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[1], 3 );
if (rs_input == NULL || rs_output == NULL)
{
goto final;
}
status = _query_kernel( kernel, inputs, outputs, resize_method );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
node_params[0] = rs_input;
node_params[1] = (vsi_nn_kernel_node_param_t)(inputs[1]->t);
node_params[2] = (vsi_nn_kernel_node_param_t)(inputs[2]->t);
node_params[3] = rs_output;
node_params[4] = vsi_nn_kernel_scalar_create( graph, I32, &ori_depth );
node_params[5] = vsi_nn_kernel_scalar_create( graph, I32, &ori_batchout );
status = vsi_nn_kernel_node_pass_param( node, node_params, _CROP_AND_RESIZE_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
}
{
// Set default border mode.
vx_border_t border;
border.mode = VX_BORDER_CONSTANT;
vsi_nn_Float32ToDtype(extrapolation_value, (uint8_t*)&border.constant_value.U32, &inputs[0]->attr.dtype);
status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) );
CHECK_STATUS(status);
}
}
final:
if (rs_input)
{
vsi_nn_kernel_tensor_release( &rs_input );
}
if (rs_output)
{
vsi_nn_kernel_tensor_release( &rs_output );
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( crop_and_resize, _setup )
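Editor's note: for reference, the per-pixel arithmetic of the nearest-neighbor variant can be sketched on the CPU as below. This deliberately ignores the box coordinates (handled in the shader from the second input tensor) and only shows the (size - 1) / (crop - 1) scale factors and the inOutScale/inOutTile requantization; it is a hedged illustration, not the kernel's exact addressing.

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Nearest-neighbor resize of a quantized 2-D plane, requantizing on the fly.
     * width/height scales follow the (size-1)/(crop-1) form used above.        */
    static void resize_nearest_u8(const uint8_t *src, int iw, int ih,
                                  uint8_t *dst, int ow, int oh,
                                  float in_out_scale, float in_out_tile)
    {
        const float wscale = ow > 1 ? (float)(iw - 1) / (float)(ow - 1) : 0.0f;
        const float hscale = oh > 1 ? (float)(ih - 1) / (float)(oh - 1) : 0.0f;
        for (int y = 0; y < oh; ++y) {
            const int sy = (int)roundf((float)y * hscale);
            for (int x = 0; x < ow; ++x) {
                const int sx = (int)roundf((float)x * wscale);
                const float v = (float)src[sy * iw + sx] * in_out_scale + in_out_tile;
                const float c = v < 0.0f ? 0.0f : (v > 255.0f ? 255.0f : v);
                dst[y * ow + x] = (uint8_t)(c + 0.5f);
            }
        }
    }

    int main(void)
    {
        const uint8_t src[4] = { 0, 64, 128, 255 };  /* 2x2 source plane */
        uint8_t dst[9];                              /* 3x3 destination  */
        /* Identity requantization: equal scales, both zero points zero. */
        resize_nearest_u8(src, 2, 2, dst, 3, 3, 1.0f, 0.0f);
        for (int i = 0; i < 9; ++i) printf("%d%c", dst[i], (i % 3 == 2) ? '\n' : ' ');
        return 0;
    }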

View File

@ -204,39 +204,11 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer)
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &reverse);
CHECK_STATUS_FAIL_GOTO(status, OnError );
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[0]->dfp.fl > 0)
{
input_scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
}
else
{
input_scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
input_scale = attr[0]->asymm.scale;
input_zp = attr[0]->asymm.zero_point;
}
input_scale = attr[0]->scale;
input_zp = attr[0]->zero_point;
if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[1]->dfp.fl > 0)
{
output_scale = (float)((int64_t)1 << attr[1]->dfp.fl);
}
else
{
output_scale = (1.0f / (float)((int64_t)1 << -attr[1]->dfp.fl));
}
}
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
output_scale = 1.0f / attr[1]->asymm.scale;
output_zp = (float)attr[1]->asymm.zero_point;
}
output_scale = 1.0f / attr[1]->scale;
output_zp = (float)attr[1]->zero_point;
in_out_scale = input_scale * output_scale;
in_out_zp_scale = (float)in_out_scale * input_zp * (-1);

View File

@ -22,7 +22,7 @@
*
*****************************************************************************/
#if !(VX_DEPTH2SPACE_CRD_MODE_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -161,51 +161,10 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer)
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &block_size);
CHECK_STATUS_FAIL_GOTO(status, OnError );
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
src0ZP = attr[0]->asymm.zero_point;
src0Scale = attr[0]->asymm.scale;
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[0]->dfp.fl > 0)
{
src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
}
else
{
src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
src0ZP = 0;
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
src0Scale = 1;
src0ZP = 0;
}
if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
dstZP = attr[1]->asymm.zero_point;
dstScale = attr[1]->asymm.scale;
}
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[1]->dfp.fl > 0)
{
dstScale = (1.0f / (float)((int64_t)1 << attr[1]->dfp.fl));
}
else
{
dstScale = (float)((int64_t)1 << -attr[1]->dfp.fl);
}
dstZP = 0;
}
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
dstScale = 1;
dstZP = 0;
}
src0ZP = attr[0]->zero_point;
src0Scale = attr[0]->scale;
dstZP = attr[1]->zero_point;
dstScale = attr[1]->scale;
output_dims = (uint32_t)attr[1]->shape->size;
output_width = (int32_t)(attr[1]->shape->data[0]);
@ -454,4 +413,4 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_EVIS( depth2space_internal, _setup )
#endif

View File

@ -250,12 +250,12 @@ DEF_KERNEL_INITIALIZER(_depthwise_conv1d_initializer)
gpu_param.global_size[1] = gpu_align_p2((output_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1], gpu_param.local_size[1]);
outputScale = input_attr->asymm.scale;
outputScale = input_attr->scale;
outputScale *= weight_attr->asymm.scale;
weightZP = weight_attr->asymm.zero_point;
outputScale /= output_attr->asymm.scale;
outputZP = (float)output_attr->asymm.zero_point + 0.5f;
outputScale *= weight_attr->scale;
weightZP = weight_attr->zero_point;
outputScale /= output_attr->scale;
outputZP = (float)output_attr->zero_point + 0.5f;
#define _PACK_SELECT_KEY( kernel_size, dilation, evis_version ) \
((uint64_t)kernel_size | ((uint64_t)dilation << 16) | ((uint64_t)evis_version << 32))

View File

@ -135,17 +135,10 @@ DEF_KERNEL_INITIALIZER(_detect_post_box_initializer)
status = vsi_nn_kernel_gpu_add_param( node, "logE", &logE);
CHECK_STATUS_FAIL_GOTO(status, final );
if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant )
{
input0_ZP = input_attr->asymm.zero_point;
scaleIn0 = input_attr->asymm.scale;
}
if ( VSI_NN_KERNEL_QUANT_ASYMM == input1_attr->quant )
{
input1_ZP = input1_attr->asymm.zero_point;
scaleIn1 = input1_attr->asymm.scale;
}
input0_ZP = input_attr->zero_point;
scaleIn0 = input_attr->scale;
input1_ZP = input1_attr->zero_point;
scaleIn1 = input1_attr->scale;
if ((F32 == input_attr->dtype) || (F32 == input1_attr->dtype))
{

View File

@ -60,6 +60,7 @@ typedef enum
UNARY_ATANH,
UNARY_ACOSH,
UNARY_INVERSE_SIGMOID,
UNARY_TAN,
} unary_type_e;
/*
@ -108,6 +109,7 @@ typedef enum
#define ATANH_OPERATION atanh
#define ACOSH_OPERATION acosh
#define INVERSE_SIGMOID_OPERATION inverse_sigmoid
#define TAN_OPERATION tan
#define ADD_UNARY_SH_KERNELS(name, source) \
TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, BF16, BF16, source##_3D) \
@ -153,6 +155,7 @@ static const struct {
ADD_UNARY_SH_KERNELS(ATAN, KERNEL_SOURCE1)
ADD_UNARY_SH_KERNELS(ATANH, KERNEL_SOURCE1)
ADD_UNARY_SH_KERNELS(ACOSH, KERNEL_SOURCE1)
ADD_UNARY_SH_KERNELS(TAN, KERNEL_SOURCE1)
ADD_UNARY_SH_KERNELS(HSIGMOID, KERNEL_SOURCE0)
ADD_UNARY_SH_KERNELS(MISH, KERNEL_SOURCE0)
@ -177,6 +180,7 @@ static const struct {
#undef RCP_OPERATION
#undef SIGN_OPERATION
#undef SOFTSIGN_OPERATION
#undef TAN_OPERATION
/*
* Kernel params
*/
@ -243,41 +247,10 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer)
}
out_shape = attr[1]->shape;
if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[0]->dfp.fl;
if (fl > 0)
{
inputScale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
inputScale = (float)((int64_t)1 << -fl);
}
}
else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
inputScale = attr[0]->asymm.scale;
inputTail = 0 - attr[0]->asymm.zero_point * inputScale;
}
if( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[1]->dfp.fl;
if (fl > 0)
{
outputScale = (float)((int64_t)1 << fl);
}
else
{
outputScale = (float)1.0f / (float) ((int64_t)1 << -fl);
}
}
else if( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
outputScale = (float)1.0f / attr[1]->asymm.scale;
outputZP = (float)attr[1]->asymm.zero_point;
}
inputScale = attr[0]->scale;
inputTail = 0 - attr[0]->zero_point * inputScale;
outputScale = (float)1.0f / attr[1]->scale;
outputZP = (float)attr[1]->zero_point;
#define _PACK_SELECT_KEY( TYPE, IN_TYPE, OUT_TYPE ) \
(( TYPE << 24) | ( IN_TYPE << 16) | ( OUT_TYPE << 8))
@ -298,17 +271,23 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer)
switch( pack_key )
{
#if !(VX_ACTIVATION_SIN_COS_VX_SUPPORT_EXT)
case _PACK_SELECT_KEY( UNARY_SIN, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_COS, BF16, BF16 ):
#endif
#if !(VX_ACTIVATION_EXP_VX_SUPPORT_EXT)
case _PACK_SELECT_KEY( UNARY_EXP, BF16, BF16 ):
#endif
case _PACK_SELECT_KEY( UNARY_LOG, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_SELU, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_NEG, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_HSIGMOID, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_MISH, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_ROUND, BF16, BF16 ):
#if !(VX_ACTIVATION_GELU_VX_SUPPORT_EXT)
case _PACK_SELECT_KEY( UNARY_GELU, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_HGELU, BF16, BF16 ):
#endif
case _PACK_SELECT_KEY( UNARY_CELU, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_RCP, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_SIGN, BF16, BF16 ):
@ -317,6 +296,7 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer)
case _PACK_SELECT_KEY( UNARY_ATANH, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_ACOSH, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_INVERSE_SIGMOID, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_TAN, BF16, BF16 ):
{
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
@ -614,16 +594,22 @@ OnError:
} \
REGISTER_BACKEND_EVIS( KERNEL_NAME, _##KERNEL_NAME##_setup )
#if !(VX_ACTIVATION_SIN_COS_VX_SUPPORT_EXT)
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( sin, UNARY_SIN )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( cos, UNARY_COS )
#endif
#if !(VX_ACTIVATION_EXP_VX_SUPPORT_EXT)
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( exp, UNARY_EXP )
#endif
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( log, UNARY_LOG )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( neg, UNARY_NEG )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( hard_sigmoid, UNARY_HSIGMOID )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( mish, UNARY_MISH )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( round, UNARY_ROUND )
#if !(VX_ACTIVATION_GELU_VX_SUPPORT_EXT)
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( gelu, UNARY_GELU )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( hard_gelu, UNARY_HGELU )
#endif
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( selu, UNARY_SELU )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( celu, UNARY_CELU )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( rcp, UNARY_RCP )
@ -633,5 +619,6 @@ REGISTER_ELTWISE_UNARY_BACKEND_EVIS( atan, UNARY_ATAN )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( atanh, UNARY_ATANH )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( acosh, UNARY_ACOSH )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( inverse_sigmoid, UNARY_INVERSE_SIGMOID )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( tan, UNARY_TAN )
__END_DECLS
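Editor's note: the new UNARY_TAN entry reuses the generic elementwise-unary plumbing; mathematically the op is tan(x) applied to the dequantized value and requantized on output. A trivial reference of that element function, assuming the same scale/tail convention used by the initializer above (outputScale held as the reciprocal of the tensor scale):

    #include <math.h>
    #include <stdio.h>

    /* Illustrative element function for the TAN unary op:
     * dequantize, apply tan, requantize.                    */
    static float unary_tan_ref(float q_in, float input_scale, float input_tail,
                               float output_scale, float output_zp)
    {
        const float x = q_in * input_scale + input_tail;
        return tanf(x) * output_scale + output_zp;
    }

    int main(void)
    {
        /* Float path: scale 1, tail 0, no requantization. */
        printf("tan(0.5) ~= %.6f\n", unary_tan_ref(0.5f, 1.0f, 0.0f, 1.0f, 0.0f));
        return 0;
    }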

View File

@ -145,41 +145,10 @@ DEF_KERNEL_INITIALIZER(_erf_initializer)
out_shape = attr[1]->shape;
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[0]->dfp.fl;
if (fl > 0)
{
inputScale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
inputScale = (float)((int64_t)1 << -fl);
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
inputScale = attr[0]->asymm.scale;
inputTail = 0 - attr[0]->asymm.zero_point * inputScale;
}
if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[1]->dfp.fl;
if (fl > 0)
{
outputScale = (float)((int64_t)1 << fl);
}
else
{
outputScale = (float)1.0f / (float) ((int64_t)1 << -fl);
}
}
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
outputScale = (float)1.0f / attr[1]->asymm.scale;
outputZP = (float)attr[1]->asymm.zero_point;
}
inputScale = attr[0]->scale;
inputTail = 0 - (float)attr[0]->zero_point * inputScale;
outputScale = (float)1.0f / attr[1]->scale;
outputZP = (float)attr[1]->zero_point;
#define _PACK_SELECT_KEY( IN_TYPE, OUT_TYPE ) \
( ( IN_TYPE << 16) | ( OUT_TYPE << 8))

View File

@ -129,9 +129,6 @@ DEF_KERNEL_INITIALIZER(_floordiv_initializer)
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t *output_shape = NULL;
vsi_nn_kernel_dtype_e input0_dtype = F16;
int32_t input0_fl = 0;
int32_t input1_fl = 0;
int32_t output_fl = 0;
float inScale0 = 1.0f;
float inScale1 = 1.0f;
float outScale = 1.0f;
@ -169,59 +166,12 @@ DEF_KERNEL_INITIALIZER(_floordiv_initializer)
(output_shape->data[2] + gpu_param.global_scale[2] - 1)
/ gpu_param.global_scale[2] : 1;
if( input0_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
input0_fl = input0_attr->dfp.fl;
if (input0_fl > 0)
{
inScale0 = 1.0f / (float) ((int64_t)1 << input0_fl);
}
else
{
inScale0 = (float)((int64_t)1 << -input0_fl);
}
}
else if( input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
inScale0 = input0_attr->asymm.scale;
in0Tail = -inScale0 * ((float)input0_attr->asymm.zero_point);
}
if( input1_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
input1_fl = input1_attr->dfp.fl;
if (input1_fl > 0)
{
inScale1 = 1.0f / (float) ((int64_t)1 << input1_fl);
}
else
{
inScale1 = (float)((int64_t)1 << -input1_fl);
}
}
else if( input1_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
inScale1 = input1_attr->asymm.scale;
in1Tail = -inScale1 * ((float)input1_attr->asymm.zero_point);
}
if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
output_fl = output_attr->dfp.fl;
if (output_fl > 0)
{
outScale = (float) ((int64_t)1 << output_fl);
}
else
{
outScale = 1.0f / (float)((int64_t)1 << -output_fl);
}
}
else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
outScale = 1.0f / output_attr->asymm.scale;
outZp = (float)(output_attr->asymm.zero_point);
}
inScale0 = input0_attr->scale;
in0Tail = 0 - inScale0 * ((float)input0_attr->zero_point);
inScale1 = input1_attr->scale;
in1Tail = 0 - inScale1 * ((float)input1_attr->zero_point);
outScale = 1.0f / output_attr->scale;
outZp = (float)(output_attr->zero_point);
if (BF16 == input0_dtype)
{

View File

@ -22,7 +22,7 @@
*
*****************************************************************************/
#if !(VX_TENSOR_GATHER_API_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -202,6 +202,7 @@ static vx_param_description_t _gather_kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here

};
#define _GATHER_PARAM_NUM _cnt_of_array( _gather_kernel_param_def )
@ -285,6 +286,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
int32_t indices_num = 1;
uint32_t input_dims1 = 0;
int32_t batch = 1;
int32_t is_array = 0;
vx_uint32 i = 0;
vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL};
vsi_size_array_t * input1_shape = NULL;
@ -308,40 +310,13 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &block_num);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &is_array);
CHECK_STATUS_FAIL_GOTO(status, OnError );
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[0]->dfp.fl > 0)
{
src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
}
else
{
src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
src0Scale = attr[0]->asymm.scale;
src0ZP = attr[0]->asymm.zero_point;
}
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[2]->dfp.fl > 0)
{
dstScale = (float)((int64_t)1 << attr[2]->dfp.fl);
}
else
{
dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl));
}
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
dstScale = 1.0f / attr[2]->asymm.scale;
dstZP = attr[2]->asymm.zero_point;
}
src0Scale = attr[0]->scale;
src0ZP = attr[0]->zero_point;
dstScale = 1.0f / attr[2]->scale;
dstZP = attr[2]->zero_point;
input1_shape = attr[1]->shape;
input_dims1 = (uint32_t)input1_shape->size;
@ -358,8 +333,16 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
}
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
shaderParam.global_size[0] = gpu_align_p2((block_size + shaderParam.global_scale[0] - 1)
/ shaderParam.global_scale[0], 4);
if (is_array)
{
shaderParam.global_size[0] = (block_size + shaderParam.global_scale[0] - 1)
/ shaderParam.global_scale[0];
}
else
{
shaderParam.global_size[0] = gpu_align_p2((block_size + shaderParam.global_scale[0] - 1)
/ shaderParam.global_scale[0], 4);
}
shaderParam.global_size[1] = indices_num;
shaderParam.global_size[2] = block_num;
@ -508,39 +491,10 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer)
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &block_num);
CHECK_STATUS_FAIL_GOTO(status, OnError );
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[0]->dfp.fl > 0)
{
src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
}
else
{
src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
src0Scale = attr[0]->asymm.scale;
src0ZP = attr[0]->asymm.zero_point;
}
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[2]->dfp.fl > 0)
{
dstScale = (float)((int64_t)1 << attr[2]->dfp.fl);
}
else
{
dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl));
}
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
dstScale = 1.0f / attr[2]->asymm.scale;
dstZP = attr[2]->asymm.zero_point;
}
src0Scale = attr[0]->scale;
src0ZP = attr[0]->zero_point;
dstScale = 1.0f / attr[2]->scale;
dstZP = attr[2]->zero_point;
input1_shape = attr[1]->shape;
input_dims1 = (uint32_t)input1_shape->size;
@ -661,8 +615,11 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer)
{
status |= vsi_nn_kernel_gpu_add_param(node, "batch", &batch);
}
status |= vsi_nn_kernel_gpu_add_param(node, "width", &width);
status |= vsi_nn_kernel_gpu_add_param(node, "remainder", &remainder);
if (indices_num > GPU_TENSOR_MAX_WIDTH || block_num > GPU_TENSOR_MAX_WIDTH)
{
status |= vsi_nn_kernel_gpu_add_param(node, "width", &width);
status |= vsi_nn_kernel_gpu_add_param(node, "remainder", &remainder);
}
CHECK_STATUS_FAIL_GOTO(status, OnError );
OnError:
@ -841,6 +798,7 @@ static vsi_nn_kernel_node_t _setup
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_num );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_num );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &is_array );
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _GATHER_PARAM_NUM );
vsi_nn_kernel_scalar_release( &tmp_params[3] );
vsi_nn_kernel_scalar_release( &tmp_params[4] );
@ -859,3 +817,4 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_EVIS( gather, _setup )
#endif
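Editor's note: the new is_array scalar decides whether global_size[0] is padded to a multiple of 4. gpu_align_p2 rounds a launch count up to the next multiple of a power-of-two alignment; a hedged sketch of that rounding (the real helper lives in the GPU utility headers and may differ in name and signature):

    #include <assert.h>
    #include <stddef.h>

    /* Round n up to the next multiple of align, where align is a power of two. */
    static size_t align_p2(size_t n, size_t align)
    {
        return (n + align - 1) & ~(align - 1);
    }

    int main(void)
    {
        assert(align_p2(5, 4) == 8);   /* padded launch width         */
        assert(align_p2(8, 4) == 8);   /* already aligned stays as-is */
        /* With is_array set, the initializer above skips this padding and
         * launches exactly (block_size + scale - 1) / scale work items.  */
        return 0;
    }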

View File

@ -290,39 +290,10 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer)
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &block_size);
CHECK_STATUS_FAIL_GOTO(status, OnError );
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[0]->dfp.fl > 0)
{
src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
}
else
{
src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
src0Scale = attr[0]->asymm.scale;
src0ZP = attr[0]->asymm.zero_point;
}
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[2]->dfp.fl > 0)
{
dstScale = (float)((int64_t)1 << attr[2]->dfp.fl);
}
else
{
dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl));
}
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
dstScale = 1.0f / attr[2]->asymm.scale;
dstZP = attr[2]->asymm.zero_point;
}
src0Scale = attr[0]->scale;
src0ZP = attr[0]->zero_point;
dstScale = 1.0f / attr[2]->scale;
dstZP = attr[2]->zero_point;
indices_num = (int32_t)(attr[1]->shape->data[1]);
batch_num = (int32_t)(attr[1]->shape->size > 2 ? attr[1]->shape->data[2] : 1);

View File

@ -238,7 +238,7 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_initializer)
float tensorZP[4] = {0.0f, 0.0f, 0.0f, 0.0f};
uint32_t i = 0;
uint32_t pack_key = 0;
vsi_size_array_t * output_shape = NULL;
vsi_size_array_t * output_shape = NULL;
vsi_nn_kernel_tensor_attr_t * attr[4] = { NULL, NULL, NULL, NULL };
VSI_UNREFERENCED(param_size);
@ -254,12 +254,8 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_initializer)
for (i = 0; i < 4; i++)
{
if( attr[i]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[i]->quant == VSI_NN_KERNEL_QUANT_SYMM)
{
tensorZP[i] = (float)attr[i]->asymm.zero_point;
tensorScale[i] = attr[i]->asymm.scale;
}
tensorZP[i] = (float)attr[i]->zero_point;
tensorScale[i] = attr[i]->scale;
}
tensorZP[0] = tensorScale[0] * tensorZP[0];
@ -459,63 +455,31 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_cdnn_initializer)
output_shape = attr[3]->shape;
if( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM )
{
input_scale = attr[0]->asymm.scale;
input_tail = 0 - input_scale * (float)attr[0]->asymm.zero_point;
}
input_scale = attr[0]->scale;
input_tail = 0 - input_scale * (float)attr[0]->zero_point;
if( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM )
{
input_r_scale = attr[1]->asymm.scale;
input_r_tail = 0 - input_r_scale * (float)attr[1]->asymm.zero_point;
}
input_r_scale = attr[1]->scale;
input_r_tail = 0 - input_r_scale * (float)attr[1]->zero_point;
if( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM )
{
recur_r_scale = attr[2]->asymm.scale;
recur_r_tail = 0 - recur_r_scale * (float)attr[2]->asymm.zero_point;
}
recur_r_scale = attr[2]->scale;
recur_r_tail = 0 - recur_r_scale * (float)attr[2]->zero_point;
if( attr[3]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[3]->quant == VSI_NN_KERNEL_QUANT_SYMM )
{
output_scale = 1.0f / attr[3]->asymm.scale;
output_zp = (float)attr[3]->asymm.zero_point;
}
output_scale = 1.0f / attr[3]->scale;
output_zp = (float)attr[3]->zero_point;
if ( param_size == _GRUCELL_CDNN_SEP_ACTIVATION_PARAM_NUM )
{
if( attr[4]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[4]->quant == VSI_NN_KERNEL_QUANT_SYMM )
{
input_z_scale = attr[4]->asymm.scale;
input_z_tail = 0 - input_z_scale * (float)attr[4]->asymm.zero_point;
}
input_z_scale = attr[4]->scale;
input_z_tail = 0 - input_z_scale * (float)attr[4]->zero_point;
if( attr[5]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[5]->quant == VSI_NN_KERNEL_QUANT_SYMM )
{
recur_z_scale = attr[5]->asymm.scale;
recur_z_tail = 0 - recur_z_scale * (float)attr[5]->asymm.zero_point;
}
recur_z_scale = attr[5]->scale;
recur_z_tail = 0 - recur_z_scale * (float)attr[5]->zero_point;
if( attr[6]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[6]->quant == VSI_NN_KERNEL_QUANT_SYMM )
{
input_c_scale = attr[6]->asymm.scale;
input_c_tail = 0 - input_c_scale * (float)attr[6]->asymm.zero_point;
}
input_c_scale = attr[6]->scale;
input_c_tail = 0 - input_c_scale * (float)attr[6]->zero_point;
if( attr[5]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[5]->quant == VSI_NN_KERNEL_QUANT_SYMM )
{
recur_c_scale = attr[7]->asymm.scale;
recur_c_tail = 0 - recur_c_scale * (float)attr[7]->asymm.zero_point;
}
recur_c_scale = attr[7]->scale;
recur_c_tail = 0 - recur_c_scale * (float)attr[7]->zero_point;
}
if (layer_out == 1 || layer_out == 2)

View File

@ -119,6 +119,8 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_z_h_initializer)
float hstate_in_tail = 0;
float output_scale = 1.0f;
float output_zp = 0;
float output_scale1 = 1.0f;
float output_zp1 = 0;
uint32_t i = 0;
uint32_t pack_key = 0;
vsi_nn_kernel_tensor_attr_t* input_attr[GRUCELL_ACT_Z_H_IN_CNT] = {NULL};
@ -142,33 +144,14 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_z_h_initializer)
output_attr[1] = vsi_nn_kernel_tensor_attr_create( hstate_out );
CHECK_PTR_FAIL_GOTO( output_attr[1], "Create tensor attr buffer fail.", final );
if ( VSI_NN_KERNEL_QUANT_DFP == input_attr[0]->quant )
{
int8_t srcFixPointPos = (int8_t)input_attr[0]->dfp.fl;
if (srcFixPointPos >= 0)
hstate_in_scale *= 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos);
else if (srcFixPointPos < 0)
hstate_in_scale *= (vx_float32)((int64_t)1 << -srcFixPointPos);
}
else if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr[0]->quant )
{
hstate_in_scale = input_attr[0]->asymm.scale;
hstate_in_tail = -(float)input_attr[0]->asymm.zero_point * hstate_in_scale;
}
hstate_in_scale = input_attr[0]->scale;
hstate_in_tail = -(float)input_attr[0]->zero_point * hstate_in_scale;
if ( VSI_NN_KERNEL_QUANT_DFP == output_attr[0]->quant )
{
int8_t dstFixPointPos = (int8_t)output_attr[0]->dfp.fl;
if (dstFixPointPos >= 0)
output_scale *= (vx_float32)((int64_t)1 << dstFixPointPos);
else if (dstFixPointPos < 0)
output_scale *= 1.0f / (vx_float32) ((int64_t)1 << - dstFixPointPos);
}
else if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr[0]->quant )
{
output_scale = 1.0f / output_attr[0]->asymm.scale;
output_zp = (float)output_attr[0]->asymm.zero_point;
}
output_scale = 1.0f / output_attr[0]->scale;
output_zp = (float)output_attr[0]->zero_point;
output_scale1 = 1.0f / output_attr[1]->scale;
output_zp1 = (float)output_attr[1]->zero_point;
pack_key = _PACK_SELECT_KEY( input_attr[0]->dtype, input_attr[1]->dtype, output_attr[0]->dtype);
@ -290,6 +273,8 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_z_h_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "hstate_in_tail", &hstate_in_tail);
status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale);
status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "output_scale1", &output_scale1);
status |= vsi_nn_kernel_gpu_add_param(node, "output_zp1", &output_zp1);
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;

View File

@ -132,19 +132,8 @@ DEF_KERNEL_INITIALIZER(_grucell_h_times_activation_r_initializer)
output_attr[0] = vsi_nn_kernel_tensor_attr_create( output );
CHECK_PTR_FAIL_GOTO( output_attr[0], "Create tensor attr buffer fail.", final );
if ( VSI_NN_KERNEL_QUANT_DFP == input_attr[0]->quant )
{
int8_t srcFixPointPos = (int8_t)input_attr[0]->dfp.fl;
if (srcFixPointPos >= 0)
hstate_in_scale *= 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos);
else if (srcFixPointPos < 0)
hstate_in_scale *= (vx_float32)((int64_t)1 << -srcFixPointPos);
}
else if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr[0]->quant )
{
hstate_in_scale = input_attr[0]->asymm.scale;
hstate_in_tail = -(float)input_attr[0]->asymm.zero_point * hstate_in_scale;
}
hstate_in_scale = input_attr[0]->scale;
hstate_in_tail = 0 - (float)input_attr[0]->zero_point * hstate_in_scale;
pack_key = _PACK_SELECT_KEY( input_attr[0]->dtype, input_attr[1]->dtype, output_attr[0]->dtype);

View File

@ -47,6 +47,7 @@ typedef enum _grucell_nn_activation_type_e
SIGMOID = VSI_NN_ACT_SIGMOID,
HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID,
TANH = VSI_NN_ACT_TANH,
RELU = VSI_NN_ACT_RELU,
}grucell_nn_activation_type_e;
#define _GRUCELL_RESET_AFTER_ACTIVATION_KERNEL_SOURCE "grucell_reset_after_activation"
@ -80,6 +81,11 @@ static const _kernel_map_type _grucell_reset_after_activation_kernel_map[] =
PACK_KERNEL_MAP( I16, F16, I16, SIGMOID, SIGMOID ),
PACK_KERNEL_MAP( F16, F16, F16, SIGMOID, SIGMOID ),
PACK_KERNEL_MAP( BF16, BF16, BF16, SIGMOID, SIGMOID ),
PACK_KERNEL_MAP( U8, F16, U8, SIGMOID, RELU ),
PACK_KERNEL_MAP( I8, F16, I8, SIGMOID, RELU ),
PACK_KERNEL_MAP( I16, F16, I16, SIGMOID, RELU ),
PACK_KERNEL_MAP( F16, F16, F16, SIGMOID, RELU ),
PACK_KERNEL_MAP( BF16, BF16, BF16, SIGMOID, RELU ),
};
@ -148,33 +154,11 @@ DEF_KERNEL_INITIALIZER(_grucell_reset_after_activation_initializer)
output_attr[1] = vsi_nn_kernel_tensor_attr_create( hstate_out );
CHECK_PTR_FAIL_GOTO( output_attr[1], "Create tensor attr buffer fail.", final );
if ( VSI_NN_KERNEL_QUANT_DFP == input_attr[0]->quant )
{
int8_t srcFixPointPos = (int8_t)input_attr[0]->dfp.fl;
if (srcFixPointPos >= 0)
hstate_in_scale *= 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos);
else if (srcFixPointPos < 0)
hstate_in_scale *= (vx_float32)((int64_t)1 << -srcFixPointPos);
}
else if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr[0]->quant )
{
hstate_in_scale = input_attr[0]->asymm.scale;
hstate_in_tail = -(float)input_attr[0]->asymm.zero_point * hstate_in_scale;
}
hstate_in_scale = input_attr[0]->scale;
hstate_in_tail = -(float)input_attr[0]->zero_point * hstate_in_scale;
if ( VSI_NN_KERNEL_QUANT_DFP == output_attr[0]->quant )
{
int8_t srcFixPointPos = (int8_t)input_attr[0]->dfp.fl;
if (srcFixPointPos >= 0)
output_scale *= (vx_float32)((int64_t)1 << srcFixPointPos);
else if (srcFixPointPos < 0)
output_scale *= 1.0f / (vx_float32) ((int64_t)1 << -srcFixPointPos);
}
else if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr[0]->quant )
{
output_scale = 1.0f / output_attr[0]->asymm.scale;
output_zp = (float)output_attr[0]->asymm.zero_point;
}
output_scale = 1.0f / output_attr[0]->scale;
output_zp = (float)output_attr[0]->zero_point;
pack_key = _PACK_SELECT_KEY( input_attr[0]->dtype, input_attr[1]->dtype, output_attr[0]->dtype);

View File

@ -127,10 +127,8 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer)
vsi_size_array_t * output_shape = NULL;
vsi_nn_kernel_dtype_e input_dtype = F16;
vsi_nn_kernel_dtype_e output_dtype = F16;
int32_t input_fl = 0;
int32_t inputZP = 0;
float inputScale = 1.0f;
int32_t output_fl = 0;
int32_t outputZP = 0;
float outputScale = 1.0f;
float r_inputScale = 1.0f;
@ -153,41 +151,11 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer)
input_dtype = input_attr->dtype;
output_dtype = output_attr->dtype;
if ( VSI_NN_KERNEL_QUANT_DFP == input_attr->quant )
{
input_fl = input_attr->dfp.fl;
if (input_fl >= 0)
{
inputScale = 1.0f / (float) ((int64_t)1 << input_fl);
}
else
{
inputScale = (float) ((int64_t)1 << -input_fl);
}
}
else if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant )
{
inputZP = input_attr->asymm.zero_point;
inputScale = input_attr->asymm.scale;
}
inputZP = input_attr->zero_point;
inputScale = input_attr->scale;
if ( VSI_NN_KERNEL_QUANT_DFP == output_attr->quant )
{
output_fl = output_attr->dfp.fl;
if (output_fl >= 0)
{
outputScale = (float) ((int64_t)1 << output_fl);
}
else
{
outputScale = 1.0f / (float) ((int64_t)1 << -output_fl);
}
}
else if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant )
{
outputZP = output_attr->asymm.zero_point;
outputScale = 1.0f / output_attr->asymm.scale;
}
outputZP = output_attr->zero_point;
outputScale = 1.0f / output_attr->scale;
e2InScale = inputScale * inputScale;
r_inputScale = 1.0f / inputScale;

View File

@ -22,6 +22,7 @@
*
*****************************************************************************/
#if !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -42,7 +43,11 @@ __BEGIN_DECLS
#define SOURCE_AXIS0_1 "layer_normalization_1"
#define SOURCE_AXIS0_2 "layer_normalization_2"
#define SOURCE_AXIS0_3 "layer_normalization_3"
#define SOURCE_AXIS01 "layer_normalization_axis01"
#define SOURCE_AXIS01_SUM "layer_normalization_axis01_sum"
#define SOURCE_AXIS01_0 "layer_normalization_axis01_0"
#define SOURCE_AXIS01_1 "layer_normalization_axis01_1"
#define SOURCE_AXIS01_2 "layer_normalization_axis01_2"
#define SOURCE_AXIS01_3 "layer_normalization_axis01_3"
#define HASH_LAYERNORM_SH_KERNEL_NAME(SRC0_TYPE, SCALE_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("evis.layer_norm_axis0_"#SRC0_TYPE"_"#SCALE_TYPE"to"#DST_TYPE)
@ -88,15 +93,15 @@ __BEGIN_DECLS
#define HASH_LN_AXIS01_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("evis.layernorm_axis01_"#SRC0_TYPE"_"#SRC1_TYPE"to"#DST_TYPE)
#define LN_AXIS01_SUMS_KERNELS(IN0_TYPE, OUT_TYPE) \
#define LN_AXIS01_SUMS_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_LAYERNORM_KEY(IN0_TYPE, U4, OUT_TYPE, 0), \
HASH_LN_AXIS01_SUMS_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \
SOURCE_AXIS01 },
SOURCE },
#define LAYERNORM_AXIS01_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE) \
#define LAYERNORM_AXIS01_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \
{ HASH_LAYERNORM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \
HASH_LN_AXIS01_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \
SOURCE_AXIS01 },
SOURCE },
typedef struct
{
@ -159,32 +164,32 @@ static const _kernel_map_type _layernorm_kernel_map[] =
static const _kernel_map_type _layernorm_axis01_kernel_map[] =
{
// Register kernel here
LN_AXIS01_SUMS_KERNELS( I8, F32 )
LN_AXIS01_SUMS_KERNELS( U8, F32 )
LN_AXIS01_SUMS_KERNELS( F16, F32 )
LN_AXIS01_SUMS_KERNELS( I16, F32 )
LN_AXIS01_SUMS_KERNELS( I8, F32, SOURCE_AXIS01_SUM )
LN_AXIS01_SUMS_KERNELS( U8, F32, SOURCE_AXIS01_SUM )
LN_AXIS01_SUMS_KERNELS( F16, F32, SOURCE_AXIS01_SUM )
LN_AXIS01_SUMS_KERNELS( I16, F32, SOURCE_AXIS01_SUM )
LAYERNORM_AXIS01_KERNELS( U8, F16, U8 )
LAYERNORM_AXIS01_KERNELS( U8, F16, F16 )
LAYERNORM_AXIS01_KERNELS( I8, F16, I8 )
LAYERNORM_AXIS01_KERNELS( I8, F16, F16 )
LAYERNORM_AXIS01_KERNELS( F16, F16, F16 )
LAYERNORM_AXIS01_KERNELS( F16, F16, I16 )
LAYERNORM_AXIS01_KERNELS( F16, F16, I8 )
LAYERNORM_AXIS01_KERNELS( F16, F16, U8 )
LAYERNORM_AXIS01_KERNELS( I16, F16, I16 )
LAYERNORM_AXIS01_KERNELS( I16, F16, F16 )
LAYERNORM_AXIS01_KERNELS( U8, F16, U8, SOURCE_AXIS01_0 )
LAYERNORM_AXIS01_KERNELS( U8, F16, F16, SOURCE_AXIS01_0 )
LAYERNORM_AXIS01_KERNELS( I8, F16, I8, SOURCE_AXIS01_1 )
LAYERNORM_AXIS01_KERNELS( I8, F16, F16, SOURCE_AXIS01_1 )
LAYERNORM_AXIS01_KERNELS( F16, F16, F16, SOURCE_AXIS01_2 )
LAYERNORM_AXIS01_KERNELS( F16, F16, I16, SOURCE_AXIS01_2 )
LAYERNORM_AXIS01_KERNELS( F16, F16, I8, SOURCE_AXIS01_2 )
LAYERNORM_AXIS01_KERNELS( F16, F16, U8, SOURCE_AXIS01_2 )
LAYERNORM_AXIS01_KERNELS( I16, F16, I16, SOURCE_AXIS01_3 )
LAYERNORM_AXIS01_KERNELS( I16, F16, F16, SOURCE_AXIS01_3 )
LAYERNORM_AXIS01_KERNELS( U8, F32, U8 )
LAYERNORM_AXIS01_KERNELS( U8, F32, F16 )
LAYERNORM_AXIS01_KERNELS( I8, F32, I8 )
LAYERNORM_AXIS01_KERNELS( I8, F32, F16 )
LAYERNORM_AXIS01_KERNELS( F16, F32, F16 )
LAYERNORM_AXIS01_KERNELS( F16, F32, I16 )
LAYERNORM_AXIS01_KERNELS( F16, F32, I8 )
LAYERNORM_AXIS01_KERNELS( F16, F32, U8 )
LAYERNORM_AXIS01_KERNELS( I16, F32, I16 )
LAYERNORM_AXIS01_KERNELS( I16, F32, F16 )
LAYERNORM_AXIS01_KERNELS( U8, F32, U8, SOURCE_AXIS01_0 )
LAYERNORM_AXIS01_KERNELS( U8, F32, F16, SOURCE_AXIS01_0 )
LAYERNORM_AXIS01_KERNELS( I8, F32, I8, SOURCE_AXIS01_1 )
LAYERNORM_AXIS01_KERNELS( I8, F32, F16, SOURCE_AXIS01_1 )
LAYERNORM_AXIS01_KERNELS( F16, F32, F16, SOURCE_AXIS01_2 )
LAYERNORM_AXIS01_KERNELS( F16, F32, I16, SOURCE_AXIS01_2 )
LAYERNORM_AXIS01_KERNELS( F16, F32, I8, SOURCE_AXIS01_2 )
LAYERNORM_AXIS01_KERNELS( F16, F32, U8, SOURCE_AXIS01_2 )
LAYERNORM_AXIS01_KERNELS( I16, F32, I16, SOURCE_AXIS01_3 )
LAYERNORM_AXIS01_KERNELS( I16, F32, F16, SOURCE_AXIS01_3 )
};
@ -1165,3 +1170,4 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_EVIS( layer_norm, _setup )
#endif

View File

@ -22,6 +22,7 @@
*
*****************************************************************************/
#if !(VX_LOGSOFTMAX_VX_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -34,15 +35,21 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
__BEGIN_DECLS
#define HASH_LOG_SOFTMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \
((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d))
#define HASH_LOG_SOFTMAX_KERNEL_SOURCE_NAME(_suffix) \
"log_softmax_axis"#_suffix
#define HASH_LOG_SOFTMAX_EXCEED_KERNEL_SOURCE_NAME(_suffix) \
"log_softmax_exceed_axis"#_suffix
#define HASH_LOG_SOFTMAX_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, _suffix) \
{ HASH_LOG_SOFTMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \
CVIVANTE_NAMESPACE("evis.log_softmax_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE), \
@ -53,11 +60,18 @@ __BEGIN_DECLS
CVIVANTE_NAMESPACE("evis.log_softmax_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE"_2D"), \
HASH_LOG_SOFTMAX_KERNEL_SOURCE_NAME(_suffix) },
static const struct {
#define HASH_LOG_SOFTMAX_EXCEED_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, _suffix) \
{ HASH_LOG_SOFTMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \
CVIVANTE_NAMESPACE("evis.log_softmax_exceed_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE), \
HASH_LOG_SOFTMAX_EXCEED_KERNEL_SOURCE_NAME(_suffix) },
typedef struct {
uint32_t key;
char* function_name;
const char* source_name;
} _log_softmax_evis_kernel_map[] =
} _kernel_map_type;
static const _kernel_map_type _log_softmax_evis_kernel_map[] =
{
HASH_LOG_SOFTMAX_KERNELS(0, F16, F16, 0)
HASH_LOG_SOFTMAX_KERNELS(0, F16, I16, 0)
@ -126,6 +140,49 @@ static const struct {
};
static const _kernel_map_type _log_softmax_exceed_evis_kernel_map[] =
{
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, F16, F16, 0)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, F16, I16, 0)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, F16, U8, 0)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, F16, I8, 0)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, I16, I16, 0)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, I16, F16, 0)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, BF16, BF16, 0_BF16)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, BF16, F32, 0_BF16)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, BF16, F16, 0_BF16)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, U8, U8, 0)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, U8, F16, 0)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, I8, I8, 0)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, I8, F16, 0)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, F16, F16, 1)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, F16, I16, 1)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, F16, U8, 1)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, F16, I8, 1)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, I16, I16, 1)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, I16, F16, 1)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, BF16, BF16, 1_BF16)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, BF16, F32, 1_BF16)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, BF16, F16, 1_BF16)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, U8, U8, 1)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, U8, F16, 1)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, I8, I8, 1)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, I8, F16, 1)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, F16, F16, 2)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, F16, I16, 2)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, F16, U8, 2)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, F16, I8, 2)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, I16, I16, 2)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, I16, F16, 2)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, BF16, BF16, 2)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, U8, U8, 2)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, U8, F16, 2)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, I8, I8, 2)
HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, I8, F16, 2)
};
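For reference, the lookup key used by both kernel maps packs the softmax axis, the input/output dtypes and the 2D-image flag into one 32-bit value. A minimal standalone sketch of the same packing (hypothetical helper name, mirroring HASH_LOG_SOFTMAX_HASH_KEY above and assuming the dtype enum values fit in 8 bits):

#include <stdint.h>

/* Illustrative sketch only, not ovxlib API: same bit layout as
 * HASH_LOG_SOFTMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d). */
static uint32_t log_softmax_hash_key( uint32_t axis, uint32_t in_dtype,
                                      uint32_t out_dtype, uint32_t image_2d )
{
    return (axis << 20) | (in_dtype << 12) | (out_dtype << 4) | image_2d;
}

The _query_kernel helpers below compute the same key and linearly scan the corresponding map for a matching entry.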
static vx_param_description_t kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
@ -133,7 +190,9 @@ static vx_param_description_t kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _EVIS_PARAM_NUM _cnt_of_array(kernel_param_def)
#define _EVIS_PARAM_NUM _cnt_of_array(kernel_param_def)
#define SCALAR_INPUT_AXIS (2)
#define SCALAR_INPUT_BETA (3)
@ -157,7 +216,7 @@ DEF_KERNEL_INITIALIZER(_log_softmax_initializer)
float beta = 0;
float input_scale = 0;
float output_scale = 0;
int32_t outputZP = 0;
float outputZP = 0;
uint32_t inputWidth = 0;
uint32_t inputWidthRemain4 = 0;
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL, NULL };
@ -385,62 +444,25 @@ DEF_KERNEL_INITIALIZER(_log_softmax_initializer)
}
}
outputZP = (float)attr[1]->zero_point;
output_scale = 1.0f / (float)(attr[1]->scale);
if( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[1]->dfp.fl;
if (fl > 0)
{
output_scale = (float)((int64_t)1 << fl);
}
else
{
output_scale = (float)1.0f / (float) ((int64_t)1 << -fl);
}
status = vsi_nn_kernel_gpu_add_param( node,
"outputScale", &output_scale );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
float output_offset_asymmetric = 0;
outputZP = attr[1]->asymm.zero_point;
output_scale = 1.0f / (float)(attr[1]->asymm.scale);
output_offset_asymmetric = (float)outputZP;
status = vsi_nn_kernel_gpu_add_param( node,
"outputScale", &output_scale );
status |= vsi_nn_kernel_gpu_add_param( node,
"output_offset_asymmetric", &output_offset_asymmetric );
"output_offset_asymmetric", &outputZP );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
{
output_scale = 1;
outputZP = 0;
}
if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[0]->dfp.fl;
if (fl > 0)
{
input_scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
input_scale = (float)((int64_t)1 << -fl);
}
}
else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
input_scale = attr[0]->asymm.scale;
}
else
{
input_scale = 1.0f;
}
input_scale = attr[0]->scale;
scaleLogE = scaleLogE * input_scale;
beta = beta * input_scale;
@ -471,6 +493,296 @@ final:
return status;
} /* _log_softmax_initializer() */
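/* Illustrative derivation (editorial sketch, not part of the original change):
 * logE above is log2(e). For affine-quantized inputs the difference to the
 * per-axis maximum satisfies (x - x_max) = input_scale * (q - q_max), since the
 * zero point cancels, so the shader can presumably evaluate
 *     exp(beta * (x - x_max)) = 2^(logE * beta * input_scale * (q - q_max))
 *                             = 2^(scaleLogE * (q - q_max)),
 * which is why input_scale is folded into both scaleLogE and beta, while
 * rlogE = 1/log2(e) converts the accumulated log2-domain term back to a
 * natural logarithm for the final log-softmax value. */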
DEF_KERNEL_INITIALIZER(_log_softmax_exceed_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
int32_t axis = 0;
float beta = 0;
float input_scale = 0;
float output_scale = 0;
float outputZP = 0;
uint32_t inputWidth = 0;
uint32_t inputWidthRemain4 = 0;
int32_t width = 0;
int32_t height = 0;
int32_t depth = 0;
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL, NULL };
vsi_size_array_t * output_shape = NULL;
float logE = (float)(log10(exp(1.0f)) / log10(2.0f));
float rlogE = (float)(log10(2.0f) / log10(exp(1.0f)));
float scaleLogE = 0;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis);
CHECK_STATUS_FAIL_GOTO(status, final );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[3], &beta);
CHECK_STATUS_FAIL_GOTO(status, final );
scaleLogE = logE * beta;
output_shape = attr[1]->shape;
width = (int32_t)output_shape->data[0];
height = (int32_t)output_shape->data[1];
depth = output_shape->size > 2 ? (int32_t)output_shape->data[2] : 1;
gpu_param.dim = 2;
switch (axis)
{
case 0:
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_size[0] = 1;
gpu_param.global_size[1] = depth;
break;
case 1:
gpu_param.global_scale[0] = 8;
gpu_param.global_scale[1] = 1;
gpu_param.global_size[0] =
gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = 1;
break;
default:
break;
}
{
gpu_dp_inst_t uniGetSubData0to3_4x4 = {{
0x09090909, // TCfg
0x04040404, // ASelt
0x00010000, 0x00030002, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniGetSubData4to7_4x4 = {{
0x09090909, // TCfg
0x04040404, // ASelt
0x00050004, 0x00070006, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackMaxData_2x8 = {{
0x00000111, // TCfg
0x00000000, // ASelt
0x00050300, 0x00000000, // ABin
0x00000222, // BSelt
0x00000000, 0x00000000, // BBin
0x00004400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x05050404, 0x07070606, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractHalf4_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00020000, 0x00060004, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniGetSubLoData_4x4 = {{
0x09090909, // TCfg
0x04040404, // ASelt
0x00110000, 0x00330022, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniGetSubHiData_4x4 = {{
0x09090909, // TCfg
0x04040404, // ASelt
0x00550044, 0x00770066, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractHalf8_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
switch( axis )
{
case 0:
{
inputWidth = (uint32_t)(output_shape->data[axis] / 4 * 4);
inputWidthRemain4 = (uint32_t)(output_shape->data[axis] % 4);
status = vsi_nn_kernel_gpu_add_param( node,
"inputWidth", &inputWidth );
status |= vsi_nn_kernel_gpu_add_param( node,
"inputWidthRemain4", &inputWidthRemain4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniPackMaxData_2x8", &uniPackMaxData_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node, "axisSize", &width );
status |= vsi_nn_kernel_gpu_add_param( node, "height", &height);
if (attr[0]->dtype == BF16)
{
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtractHalf4_4x4", &uniExtractHalf4_4x4 );
}
else
{
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniGetSubData0to3_4x4", &uniGetSubData0to3_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniGetSubData4to7_4x4", &uniGetSubData4to7_4x4 );
}
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
case 1:
{
if (attr[0]->dtype == BF16)
{
status = vsi_nn_kernel_gpu_add_param( node,
"uniExtractHalf8_2x8", &uniExtractHalf8_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtractOddData_2x8", &uniExtractOddData_2x8 );
}
else
{
status = vsi_nn_kernel_gpu_add_param( node,
"uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniGetSubLoData_4x4", &uniGetSubLoData_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniGetSubHiData_4x4", &uniGetSubHiData_4x4 );
}
status |= vsi_nn_kernel_gpu_add_param( node, "axisSize", &height );
status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth);
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
default:
break;
}
}
outputZP = (float)attr[1]->zero_point;
output_scale = 1.0f / attr[1]->scale;
if (attr[0]->dtype != BF16)
{
status = vsi_nn_kernel_gpu_add_param( node,
"outputScale", &output_scale );
status |= vsi_nn_kernel_gpu_add_param( node,
"output_offset_asymmetric", &outputZP );
CHECK_STATUS_FAIL_GOTO(status, final );
}
input_scale = attr[0]->scale;
scaleLogE = scaleLogE * input_scale;
beta = beta * input_scale;
status |= vsi_nn_kernel_gpu_add_param( node,
"rlogE", &rlogE );
status |= vsi_nn_kernel_gpu_add_param( node,
"betaValue", &beta );
status |= vsi_nn_kernel_gpu_add_param( node,
"scaleLogE", &scaleLogE );
status |= vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, final );
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
}
return status;
}
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@ -513,7 +825,51 @@ static vsi_status _query_kernel
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
static vsi_status _query_kernel_exceed
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
int32_t axis,
vsi_nn_kernel_t* kernel
)
{
vsi_nn_kernel_dtype_e input_dtype;
vsi_nn_kernel_dtype_e output_dtype;
vsi_status status = VSI_FAILURE;
uint32_t key;
size_t i;
input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = HASH_LOG_SOFTMAX_HASH_KEY( axis, input_dtype, output_dtype, 0 );
for( i = 0; i < _cnt_of_array(_log_softmax_exceed_evis_kernel_map); i ++ )
{
if( _log_softmax_exceed_evis_kernel_map[i].key == key )
{
break;
}
}
if( i < _cnt_of_array(_log_softmax_exceed_evis_kernel_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _log_softmax_exceed_evis_kernel_map[i].function_name );
kernel->info.parameters = kernel_param_def;
kernel->info.numParams = _cnt_of_array( kernel_param_def );
kernel->info.initialize = _log_softmax_exceed_initializer;
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
_log_softmax_exceed_evis_kernel_map[i].source_name );
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
_log_softmax_exceed_evis_kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
}
static vsi_nn_kernel_node_t _setup_not_exceed
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
@ -528,7 +884,13 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_param_t node_params[_EVIS_PARAM_NUM] = {NULL};
vsi_bool image_2d = FALSE;
vsi_nn_kernel_node_t node = NULL;
vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } };
uint32_t rank_in = 0;
int32_t axis = 0;
int32_t new_axis = 0;
vsi_bool ret = vx_false_e;
uint32_t i = 0;
float beta = 1.0f;
VSI_UNREFERENCED(input_num);
@ -537,15 +899,31 @@ static vsi_nn_kernel_node_t _setup
axis = vsi_nn_kernel_param_get_int32(params, "axis");
beta = vsi_nn_kernel_param_get_float32(params, "beta");
if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size,
inputs[0]->attr.dim_num )
|| axis > 2)
ret = vsi_nn_kernel_optimize_softmax_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
shapes[0], &rank_in, &new_axis);
if (ret)
{
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shapes[0], rank_in );
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[0], rank_in );
}
else
{
return NULL;
}
image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1);
status = _query_kernel( inputs, outputs, axis, image_2d, kernel );
if( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[0]->attr.size,
reshape_tensors[0]->attr.dim_num )
|| new_axis > 2)
{
return NULL;
}
image_2d = (reshape_tensors[0]->attr.dim_num == 2 || reshape_tensors[0]->attr.size[2] == 1);
status = _query_kernel( inputs, outputs, new_axis, image_2d, kernel );
if( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
@ -553,9 +931,9 @@ static vsi_nn_kernel_node_t _setup
{
/* Pass parameters to node. */
vsi_nn_kernel_node_pack_io( node_params, _EVIS_PARAM_NUM,
inputs, 1, outputs, 1 );
reshape_tensors, 1, &reshape_tensors[1], 1 );
node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create(
graph, I32, &axis );
graph, I32, &new_axis );
node_params[SCALAR_INPUT_BETA] = vsi_nn_kernel_scalar_create(
graph, F32, &beta );
@ -565,10 +943,132 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_BETA] );
}
}
for (i = 0; i < 2; i++)
{
vsi_safe_release_tensor(reshape_tensors[i]);
}
return node;
} /* _setup() */
static vsi_nn_kernel_node_t _setup_exceed
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_EVIS_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } };
uint32_t rank_in = 0;
int32_t axis = 0;
int32_t new_axis = 0;
vsi_bool ret = vx_false_e;
uint32_t i = 0;
float beta = 1.0f;
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
axis = vsi_nn_kernel_param_get_int32(params, "axis");
beta = vsi_nn_kernel_param_get_float32(params, "beta");
ret = vsi_nn_kernel_optimize_softmax_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
shapes[0], &rank_in, &new_axis);
if (ret)
{
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shapes[0], rank_in );
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[0], rank_in );
}
else
{
return NULL;
}
if( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[0]->attr.size,
reshape_tensors[0]->attr.dim_num )
|| new_axis > 1)
{
return NULL;
}
status = _query_kernel_exceed(inputs, outputs, new_axis, kernel);
if( VSI_SUCCESS != status)
{
goto final;
}
node = vsi_nn_kernel_create_node( graph, kernel );
CHECK_PTR_FAIL_GOTO( node, "Create kernel fail.", final );
if (node)
{
vsi_nn_kernel_node_pack_io(node_params, _EVIS_PARAM_NUM,
reshape_tensors,
input_num,
&reshape_tensors[1],
output_num);
node_params[2] = vsi_nn_kernel_scalar_create(graph, I32, &new_axis );
node_params[3] = vsi_nn_kernel_scalar_create(graph, F32, &beta );
status = vsi_nn_kernel_node_pass_param(
node, node_params, _EVIS_PARAM_NUM);
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &node_params[2] );
vsi_nn_kernel_scalar_release( &node_params[3] );
}
final:
for (i = 0; i < 2; i++)
{
vsi_safe_release_tensor(reshape_tensors[i]);
}
return node;
}
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_nn_kernel_node_t node = NULL;
vsi_size_t *input_size = inputs[0]->attr.size;
int32_t axis = 0;
axis = vsi_nn_kernel_param_get_int32(params, "axis");
if (input_size[axis] >= GPU_TENSOR_MAX_WIDTH)
{
node = _setup_exceed(graph, inputs, input_num, outputs, output_num, params, kernel);
}
else
{
node = _setup_not_exceed(graph, inputs, input_num, outputs, output_num, params, kernel);
}
return node;
}
__END_DECLS
REGISTER_BACKEND_EVIS( log_softmax, _setup )
#endif

View File

@ -996,18 +996,14 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer)
float forget_bias = 0.0f;
float outputScale = 1.0f;
float outputZP = 0;
int32_t dstZP = 0;
float dstScale = 1.0f;
vsi_nn_kernel_dtype_e cellFormat = F16;
vsi_nn_kernel_dtype_e dstFormat = F16;
vsi_nn_kernel_quant_type_e dstQuantType = VSI_NN_KERNEL_QUANT_NONE;
int32_t dstFixPointPos = 0;
float logE = (vx_float32)(log10(exp(1.0f)) / log10(2.0f));
float logE = (float)(log10(exp(1.0f)) / log10(2.0f));
float twoLogE = 2 * logE;
uint32_t uint_min = 0xFBFFFFFF;
uint32_t uint_max = 0x7BFFFFFF;
float float_min = *(vx_float32 *)&uint_min;
float float_max = *(vx_float32 *)&uint_max;
float float_min = *(float *)&uint_min;
float float_max = *(float *)&uint_max;
float clip_Min_F[4] = {0};
float clip_Max_F[4] = {0};
uint32_t i = 0;
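The float_min/float_max bounds above are produced by reinterpreting fixed 32-bit patterns through a pointer cast. A minimal equivalent sketch (illustrative only, not part of this change) that performs the same reinterpretation via memcpy, which sidesteps strict-aliasing concerns in C:

#include <stdint.h>
#include <string.h>

/* Illustrative helper: bits_to_float(0x7BFFFFFF) and bits_to_float(0xFBFFFFFF)
 * reproduce the float_max / float_min clamp values used by this initializer. */
static float bits_to_float( uint32_t bits )
{
    float f;
    memcpy( &f, &bits, sizeof f );   /* reinterpret the raw 32-bit pattern */
    return f;
}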
@ -1063,22 +1059,11 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer)
status = vsi_nn_kernel_scalar_read_float32( (vsi_nn_kernel_scalar_t)param[param_size - 1], &forget_bias );
CHECK_STATUS_FAIL_GOTO(status, final );
cellFormat = attr[0]->dtype;
dstFormat = attr[1]->dtype;
cellFormat = attr[0]->dtype;
dstFormat = attr[1]->dtype;
dstQuantType = attr[1]->quant;
if ( VSI_NN_KERNEL_QUANT_DFP == dstQuantType )
{
dstFixPointPos = (int8_t)attr[1]->dfp.fl;
}
else if ( VSI_NN_KERNEL_QUANT_ASYMM == dstQuantType )
{
dstZP = attr[1]->asymm.zero_point;
dstScale = attr[1]->asymm.scale;
}
outputZP = (vx_float32)dstZP;
outputScale = 1.0f / attr[1]->scale;
outputZP = (float)attr[1]->zero_point;
gpu_param.global_scale[0] = 4;
gpu_param.global_scale[1] = 1;
@ -1182,20 +1167,6 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer)
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
if (dstQuantType == VSI_NN_KERNEL_QUANT_DFP)
{
if (dstFixPointPos >= 0)
outputScale *= (vx_float32)((int64_t)1 << dstFixPointPos);
else if (dstFixPointPos < 0)
outputScale *= 1.0f / (vx_float32) ((int64_t)1 << -dstFixPointPos);
outputZP = 0;
}
else if (dstQuantType == VSI_NN_KERNEL_QUANT_ASYMM)
{
outputScale = 1.0f / dstScale;
}
if ( cellFormat == F16 )
{
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractHalf4_4x4", &uniExtractHalf4_4x4);

View File

@ -288,67 +288,13 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer)
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &K);
CHECK_STATUS_FAIL_GOTO(status, OnError );
src0ZP = attr[0]->asymm.zero_point;
src0Scale = attr[0]->asymm.scale;
src1ZP = attr[1]->asymm.zero_point;
src1Scale = attr[1]->asymm.scale;
dstZP = (float)attr[2]->asymm.zero_point;
dstScale = attr[2]->asymm.scale;
src0ZP = attr[0]->zero_point;
src0Scale = attr[0]->scale;
src1ZP = attr[1]->zero_point;
src1Scale = attr[1]->scale;
dstZP = (float)attr[2]->zero_point;
dstScale = attr[2]->scale;
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[0]->dfp.fl > 0)
{
src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
}
else
{
src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
src0ZP = 0;
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE)
{
src0Scale = 1;
src0ZP = 0;
}
if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[1]->dfp.fl > 0)
{
src1Scale = (1.0f / ((float) ((int64_t)1 << attr[1]->dfp.fl)));
}
else
{
src1Scale = ((float) ((int64_t)1 << -attr[1]->dfp.fl));
}
src1ZP = 0;
}
else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE)
{
src1Scale = 1;
src1ZP = 0;
}
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[2]->dfp.fl > 0)
{
dstScale = (float)((int64_t)1 << attr[2]->dfp.fl);
}
else
{
dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl));
}
dstScale = 1.0f / dstScale;
dstZP = 0.0f;
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
dstScale = 1;
dstZP = 0.0f;
}
gpu_quantize_multiplier_16bit(src0Scale / 1.0f, &M0, &postShift0);
gpu_quantize_multiplier_16bit(src1Scale / 1.0f, &M1, &postShift1);
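gpu_quantize_multiplier_16bit turns each float rescale factor into an integer multiplier plus post-shift pair (M0/postShift0, M1/postShift1) for the shader. Its implementation is not part of this diff; a minimal sketch of the general multiplier/post-shift idea, with hypothetical names and no claim to match the ovxlib routine:

#include <math.h>
#include <stdint.h>

/* Sketch only: approximate scale ~= multiplier * 2^(-post_shift), with the
 * multiplier normalized into [2^14, 2^15), so a float rescale can be applied
 * as an integer multiply followed by a right shift. */
static void quantize_multiplier_16bit_sketch( float scale,
                                              uint16_t *multiplier,
                                              int32_t  *post_shift )
{
    int exponent = 0;
    float mantissa = frexpf( scale, &exponent );            /* scale = mantissa * 2^exponent */
    *multiplier = (uint16_t)lroundf( mantissa * 32768.0f ); /* mantissa * 2^15 */
    *post_shift = 15 - exponent;
}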
@ -1266,67 +1212,12 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_cross_initializer)
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &axis_size);
CHECK_STATUS_FAIL_GOTO(status, OnError );
src0ZP = attr[0]->asymm.zero_point;
src0Scale = attr[0]->asymm.scale;
src1ZP = attr[1]->asymm.zero_point;
src1Scale = attr[1]->asymm.scale;
dstZP = (float)attr[2]->asymm.zero_point;
dstScale = attr[2]->asymm.scale;
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[0]->dfp.fl > 0)
{
src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
}
else
{
src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
src0ZP = 0;
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE)
{
src0Scale = 1;
src0ZP = 0;
}
if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[1]->dfp.fl > 0)
{
src1Scale = (1.0f / ((float) ((int64_t)1 << attr[1]->dfp.fl)));
}
else
{
src1Scale = ((float) ((int64_t)1 << -attr[1]->dfp.fl));
}
src1ZP = 0;
}
else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE)
{
src1Scale = 1;
src1ZP = 0;
}
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[2]->dfp.fl > 0)
{
dstScale = (float)((int64_t)1 << attr[2]->dfp.fl);
}
else
{
dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl));
}
dstScale = 1.0f / dstScale;
dstZP = 0.0f;
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
dstScale = 1;
dstZP = 0.0f;
}
src0ZP = attr[0]->zero_point;
src0Scale = attr[0]->scale;
src1ZP = attr[1]->zero_point;
src1Scale = attr[1]->scale;
dstZP = (float)attr[2]->zero_point;
dstScale = attr[2]->scale;
mulKIn0In1Zp = (float)((int)(K + 3) / 4 * 4 * src1ZP * src0ZP);
inOutScale = src0Scale * src1Scale / dstScale;
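/* Illustrative derivation (editorial sketch, not part of the original change):
 * with affine quantization a = src0Scale * (qa - src0ZP) and
 * b = src1Scale * (qb - src1ZP), one output element is
 *     c = sum_k a * b
 *       = src0Scale * src1Scale * ( sum_k qa*qb - src0ZP * sum_k qb
 *                                   - src1ZP * sum_k qa + K * src0ZP * src1ZP ),
 * and requantizing as c / dstScale + dstZP yields the inOutScale factor above.
 * mulKIn0In1Zp uses (K + 3) / 4 * 4 instead of K, presumably because the
 * dot-product path accumulates the constant zero-point term over K rounded up
 * to a multiple of 4. */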

View File

@ -163,63 +163,12 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer)
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );
out_shape = attr[2]->shape;
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[0]->dfp.fl;
if (fl > 0)
{
input0_scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
input0_scale = (float)((int64_t)1 << -fl);
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM)
{
input0_zp = attr[0]->asymm.zero_point;
input0_scale = attr[0]->asymm.scale;
}
if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[1]->dfp.fl;
if (fl > 0)
{
input1_scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
input1_scale = (float)((int64_t)1 << -fl);
}
}
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM)
{
input1_zp = attr[1]->asymm.zero_point;
input1_scale = attr[1]->asymm.scale;
}
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[2]->dfp.fl;
if (fl > 0)
{
output_scale = (float) ((int64_t)1 << fl);
}
else
{
output_scale = 1.0f / (float)((int64_t)1 << -fl);
}
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM)
{
output_zp = attr[2]->asymm.zero_point;
output_scale = 1.0f / attr[2]->asymm.scale;
}
input0_zp = attr[0]->zero_point;
input0_scale = attr[0]->scale;
input1_zp = attr[1]->zero_point;
input1_scale = attr[1]->scale;
output_zp = attr[2]->zero_point;
output_scale = 1.0f / attr[2]->scale;
#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \
(IN0_TYPE | (IN1_TYPE << 8) | ( OUT_TYPE << 16))
@ -454,30 +403,52 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_tensor_t* tmp_inputs[2] = { NULL };
vsi_nn_type_e dtype1 = inputs[0]->attr.dtype.vx_type;
vsi_nn_type_e dtype2 = inputs[1]->attr.dtype.vx_type;
vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } };
vsi_size_t new_rank = 0;
vsi_bool ret = TRUE;
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
VSI_UNREFERENCED(params);
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
ret = vsi_nn_kernel_optimize_eltwise_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num,
inputs[1]->attr.size, inputs[1]->attr.dim_num,
outputs[0]->attr.size, outputs[0]->attr.dim_num,
shapes[0], shapes[1], shapes[2], &new_rank );
if (ret == FALSE)
{
return NULL;
goto final;
}
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shapes[0], new_rank );
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
inputs[1], shapes[1], new_rank );
reshape_tensors[2] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[2], new_rank );
if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[2]->attr.size,
reshape_tensors[2]->attr.dim_num ) )
{
goto final;
}
// Reorder tensor
if ( dtype1 != dtype2 && dtype1 == VSI_NN_TYPE_FLOAT16 )
{
int32_t order[2] = {1, 0};
vsi_nn_reorder_tensor( inputs, order, 2, tmp_inputs );
vsi_nn_reorder_tensor( reshape_tensors, order, 2, tmp_inputs );
}
else
{
memmove( tmp_inputs, inputs, sizeof(vsi_nn_tensor_t*) * 2 );
memmove( tmp_inputs, reshape_tensors, sizeof(vsi_nn_tensor_t*) * 2 );
}
image_2d = (outputs[0]->attr.dim_num == 2);
status = _query_kernel( tmp_inputs, outputs, image_2d, kernel );
image_2d = (reshape_tensors[2]->attr.dim_num == 2);
status = _query_kernel( tmp_inputs, &reshape_tensors[2], image_2d, kernel );
if ( VSI_SUCCESS == status )
{
node = vsi_nn_kernel_create_node( graph, kernel );
@ -485,10 +456,16 @@ static vsi_nn_kernel_node_t _setup
{
/* Pass parameters to node. */
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PARAM_NUM,
tmp_inputs, 2, outputs, 1 );
tmp_inputs, 2, &reshape_tensors[2], 1 );
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PARAM_NUM );
}
}
final:
vsi_safe_release_tensor(reshape_tensors[0]);
vsi_safe_release_tensor(reshape_tensors[1]);
vsi_safe_release_tensor(reshape_tensors[2]);
return node;
} /* _setup() */

View File

@ -163,63 +163,12 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer)
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );
out_shape = attr[2]->shape;
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[0]->dfp.fl;
if (fl > 0)
{
input0_scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
input0_scale = (float)((int64_t)1 << -fl);
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM)
{
input0_zp = attr[0]->asymm.zero_point;
input0_scale = attr[0]->asymm.scale;
}
if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[1]->dfp.fl;
if (fl > 0)
{
input1_scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
input1_scale = (float)((int64_t)1 << -fl);
}
}
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM)
{
input1_zp = attr[1]->asymm.zero_point;
input1_scale = attr[1]->asymm.scale;
}
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[2]->dfp.fl;
if (fl > 0)
{
output_scale = (float) ((int64_t)1 << fl);
}
else
{
output_scale = 1.0f / (float)((int64_t)1 << -fl);
}
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM)
{
output_zp = attr[2]->asymm.zero_point;
output_scale = 1.0f / attr[2]->asymm.scale;
}
input0_zp = attr[0]->zero_point;
input0_scale = attr[0]->scale;
input1_zp = attr[1]->zero_point;
input1_scale = attr[1]->scale;
output_zp = attr[2]->zero_point;
output_scale = 1.0f / attr[2]->scale;
#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \
(IN0_TYPE | (IN1_TYPE << 8) | ( OUT_TYPE << 16))
@ -454,30 +403,52 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_tensor_t* tmp_inputs[2] = { NULL };
vsi_nn_type_e dtype1 = inputs[0]->attr.dtype.vx_type;
vsi_nn_type_e dtype2 = inputs[1]->attr.dtype.vx_type;
vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } };
vsi_size_t new_rank = 0;
vsi_bool ret = TRUE;
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
VSI_UNREFERENCED(params);
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
ret = vsi_nn_kernel_optimize_eltwise_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num,
inputs[1]->attr.size, inputs[1]->attr.dim_num,
outputs[0]->attr.size, outputs[0]->attr.dim_num,
shapes[0], shapes[1], shapes[2], &new_rank );
if (ret == FALSE)
{
return NULL;
goto final;
}
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shapes[0], new_rank );
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
inputs[1], shapes[1], new_rank );
reshape_tensors[2] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[2], new_rank );
if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[2]->attr.size,
reshape_tensors[2]->attr.dim_num ) )
{
goto final;
}
// Reorder tensor
if ( dtype1 != dtype2 && dtype1 == VSI_NN_TYPE_FLOAT16 )
{
int32_t order[2] = {1, 0};
vsi_nn_reorder_tensor( inputs, order, 2, tmp_inputs );
vsi_nn_reorder_tensor( reshape_tensors, order, 2, tmp_inputs );
}
else
{
memmove( tmp_inputs, inputs, sizeof(vsi_nn_tensor_t*) * 2 );
memmove( tmp_inputs, reshape_tensors, sizeof(vsi_nn_tensor_t*) * 2 );
}
image_2d = (outputs[0]->attr.dim_num == 2);
status = _query_kernel( tmp_inputs, outputs, image_2d, kernel );
image_2d = (reshape_tensors[2]->attr.dim_num == 2);
status = _query_kernel( tmp_inputs, &reshape_tensors[2], image_2d, kernel );
if ( VSI_SUCCESS == status )
{
node = vsi_nn_kernel_create_node( graph, kernel );
@ -485,10 +456,16 @@ static vsi_nn_kernel_node_t _setup
{
/* Pass parameters to node. */
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PARAM_NUM,
tmp_inputs, 2, outputs, 1 );
tmp_inputs, 2, &reshape_tensors[2], 1 );
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PARAM_NUM );
}
}
final:
vsi_safe_release_tensor(reshape_tensors[0]);
vsi_safe_release_tensor(reshape_tensors[1]);
vsi_safe_release_tensor(reshape_tensors[2]);
return node;
} /* _setup() */

View File

@ -128,9 +128,6 @@ DEF_KERNEL_INITIALIZER(_mod_initializer)
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t *output_shape = NULL;
vsi_nn_kernel_dtype_e input0_dtype = F16;
int32_t input0_fl = 0;
int32_t input1_fl = 0;
int32_t output_fl = 0;
float inScale0 = 1.0f;
float inScale1 = 1.0f;
float outScale = 1.0f;
@ -168,59 +165,12 @@ DEF_KERNEL_INITIALIZER(_mod_initializer)
(output_shape->data[2] + gpu_param.global_scale[2] - 1)
/ gpu_param.global_scale[2] : 1;
if (input0_attr->quant == VSI_NN_KERNEL_QUANT_DFP)
{
input0_fl = input0_attr->dfp.fl;
if (input0_fl > 0)
{
inScale0 = 1.0f / (float) ((int64_t)1 << input0_fl);
}
else
{
inScale0 = (float)((int64_t)1 << -input0_fl);
}
}
else if (input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
inScale0 = input0_attr->asymm.scale;
in0Tail = -inScale0 * ((float)input0_attr->asymm.zero_point);
}
if (input1_attr->quant == VSI_NN_KERNEL_QUANT_DFP)
{
input1_fl = input1_attr->dfp.fl;
if (input1_fl > 0)
{
inScale1 = 1.0f / (float) ((int64_t)1 << input1_fl);
}
else
{
inScale1 = (float)((int64_t)1 << -input1_fl);
}
}
else if (input1_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
inScale1 = input1_attr->asymm.scale;
in1Tail = -inScale1 * ((float)input1_attr->asymm.zero_point);
}
if (output_attr->quant == VSI_NN_KERNEL_QUANT_DFP)
{
output_fl = output_attr->dfp.fl;
if (output_fl > 0)
{
outScale = (float) ((int64_t)1 << output_fl);
}
else
{
outScale = 1.0f / (float)((int64_t)1 << -output_fl);
}
}
else if (output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
outScale = 1.0f / output_attr->asymm.scale;
outZp = (float)(output_attr->asymm.zero_point);
}
inScale0 = input0_attr->scale;
in0Tail = 0 - inScale0 * ((float)input0_attr->zero_point);
inScale1 = input1_attr->scale;
in1Tail = 0 - inScale1 * ((float)input1_attr->zero_point);
outScale = 1.0f / output_attr->scale;
outZp = (float)(output_attr->zero_point);
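/* Illustrative summary (grounded in the assignments above): these terms apply
 * the usual affine mapping,
 *     x_real = scale * (q - zero_point) = scale * q + tail,  tail = -zero_point * scale,
 * on each input, and q_out = y_real * outScale + outZp on the output, with
 * outScale = 1 / output_attr->scale. */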
if (BF16 == input0_dtype)
{

View File

@ -239,76 +239,12 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
input_shape = attr[0]->shape;
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
input_zp = attr[0]->asymm.zero_point;
scaleIn = attr[0]->asymm.scale;
}
else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[0]->dfp.fl > 0)
{
scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
}
else
{
scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
input_zp = 0;
}
else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE)
{
input_zp = 0;
scaleIn = 1;
}
if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
output_ZP0 = (float)attr[1]->asymm.zero_point;
outputScale0 = 1.0f / attr[1]->asymm.scale;
}
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[1]->dfp.fl > 0)
{
outputScale0 = (float)((int64_t)1 << attr[1]->dfp.fl);
}
else
{
outputScale0 = (1.0f / (float)((int64_t)1 << -attr[1]->dfp.fl));
}
output_ZP0 = 0.0f;
}
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
outputScale0 = 1.0f;
output_ZP0 = 0.0f;
}
if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
output_ZP1 = (float)attr[2]->asymm.zero_point;
outputScale1 = 1.0f / attr[2]->asymm.scale;
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[2]->dfp.fl > 0)
{
outputScale1 = (float)((int64_t)1 << attr[2]->dfp.fl);
}
else
{
outputScale1 = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl));
}
output_ZP1 = 0.0f;
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
outputScale1 = 1.0f;
output_ZP1 = 0.0f;
}
input_zp = attr[0]->zero_point;
scaleIn = attr[0]->scale;
output_ZP0 = (float)attr[1]->zero_point;
outputScale0 = 1.0f / attr[1]->scale;
output_ZP1 = (float)attr[2]->zero_point;
outputScale1 = 1.0f / attr[2]->scale;
output_ZP[0] = output_ZP0;
output_ZP[1] = output_ZP1;

View File

@ -160,16 +160,13 @@ DEF_KERNEL_INITIALIZER(_one_hot_initializer)
in_shape = attr[0]->shape;
depth = (int32_t)(attr[1]->shape->data[1]);
input_dtype = attr[0]->dtype;
input_zp = attr[0]->zero_point;
scaleIn = attr[0]->scale;
if (VSI_NN_KERNEL_QUANT_DFP == attr[0]->quant)
{
srcFixPointPos = attr[0]->dfp.fl;
}
else if (VSI_NN_KERNEL_QUANT_ASYMM == attr[0]->quant)
{
input_zp = attr[0]->asymm.zero_point;
scaleIn = attr[0]->asymm.scale;
}
if (suffix_size == 1)
{

View File

@ -155,41 +155,19 @@ DEF_KERNEL_INITIALIZER(_poolwithargmax_initializer)
input_shape = input_attr->shape;
src_dtype = input_attr->dtype;
dst_dtype = output_attr->dtype;
inputScale = input_attr->scale;
input_ZP = input_attr->zero_point;
outputScale = output_attr->scale;
output_ZP = output_attr->zero_point;
if( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
input_fl = input_attr->dfp.fl;
if (input_fl > 0)
{
inputScale = 1.0f / (float) ((int64_t)1 << input_fl);
}
else
{
inputScale = (float)((int64_t)1 << -input_fl);
}
}
else if( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
inputScale = input_attr->asymm.scale;
input_ZP = input_attr->asymm.zero_point;
}
if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
output_fl = output_attr->dfp.fl;
if (output_fl > 0)
{
outputScale = 1.0f / (float) ((int64_t)1 << output_fl);
}
else
{
outputScale = (float)((int64_t)1 << -output_fl);
}
}
else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
outputScale = output_attr->asymm.scale;
output_ZP = output_attr->asymm.zero_point;
}
if ( ( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )

View File

@ -22,6 +22,7 @@
*
*****************************************************************************/
#if !(VX_TENSOR_POW_API_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -158,64 +159,13 @@ DEF_KERNEL_INITIALIZER(_pow_initializer)
attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError );
out_shape = attr[2]->shape;
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[0]->dfp.fl;
if (fl > 0)
{
input0_scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
input0_scale = (float)((int64_t)1 << -fl);
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM )
{
input0_scale = attr[0]->asymm.scale;
input0_tail = 0 - (float)attr[0]->asymm.zero_point * input0_scale;
}
if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[1]->dfp.fl;
if (fl > 0)
{
input1_scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
input1_scale = (float)((int64_t)1 << -fl);
}
}
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM)
{
input1_scale = attr[1]->asymm.scale;
input1_tail = 0 - (float)attr[1]->asymm.zero_point * input1_scale;
}
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[2]->dfp.fl;
if (fl > 0)
{
output_scale = (float) ((int64_t)1 << fl);
}
else
{
output_scale = 1.0f / (float)((int64_t)1 << -fl);
}
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM )
{
output_zp = (float)attr[2]->asymm.zero_point;
output_scale = 1.0f / attr[2]->asymm.scale;
}
out_shape = attr[2]->shape;
input0_scale = attr[0]->scale;
input0_tail = 0 - (float)attr[0]->zero_point * input0_scale;
input1_scale = attr[1]->scale;
input1_tail = 0 - (float)attr[1]->zero_point * input1_scale;
output_zp = (float)attr[2]->zero_point;
output_scale = 1.0f / attr[2]->scale;
#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \
(IN0_TYPE | (IN1_TYPE << 8) | ( OUT_TYPE << 16))
@ -454,3 +404,4 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_EVIS( pow, _setup )
#endif

View File

@ -140,28 +140,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer)
}
enable_copy = (int32_t)(xRatio == (1 << 15) && yRatio == (1 << 15));
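/* Presumably xRatio/yRatio are Q15 fixed-point resize ratios, so (1 << 15)
 * corresponds to a 1:1 mapping and enable_copy selects the copy kernel
 * instead of the scaling kernel. */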
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[0]->dfp.fl > 0)
{
outputScale = (float)((int64_t)1 << attr[0]->dfp.fl);
}
else
{
outputScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl));
}
dstZP = 0;
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
outputScale = 1.0f / attr[0]->asymm.scale;
dstZP = attr[0]->asymm.zero_point;
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
outputScale = 1;
dstZP = 0;
}
outputScale = 1.0f / attr[0]->scale;
dstZP = attr[0]->zero_point;
shaderParam.global_scale[0] = 4;
shaderParam.global_scale[1] = 1;

View File

@ -133,28 +133,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_gray_copy_initializer)
width = (uint32_t)(out_shape->data[0]);
height = (uint32_t)(out_shape->data[1]);
if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[0]->dfp.fl > 0)
{
outputScale = (float)((int64_t)1 << attr[0]->dfp.fl);
}
else
{
outputScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl));
}
dstZP = 0.0f;
}
else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
outputScale = 1.0f / attr[0]->asymm.scale;
dstZP = (float)attr[0]->asymm.zero_point;
}
else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
outputScale = 1;
dstZP = 0.0f;
}
outputScale = 1.0f / attr[0]->scale;
dstZP = (float)attr[0]->zero_point;
shaderParam.global_scale[0] = 16;
shaderParam.global_scale[1] = 1;
@ -232,33 +212,11 @@ DEF_KERNEL_INITIALIZER(_pre_process_gray_initializer)
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
out_shape = attr[0]->shape;
dstZP = (float)attr[0]->asymm.zero_point;
outputScale = attr[0]->asymm.scale;
dstZP = (float)attr[0]->zero_point;
outputScale = 1.0f / attr[0]->scale;
width = (uint32_t)(out_shape->data[0]);
height = (uint32_t)(out_shape->data[1]);
if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[0]->dfp.fl > 0)
{
outputScale = (float)((int64_t)1 << attr[0]->dfp.fl);
}
else
{
outputScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl));
}
dstZP = 0.0f;
}
else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
outputScale = 1.0f/outputScale;
}
else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
outputScale = 1;
dstZP = 0.0f;
}
shaderParam.global_scale[0] = 4;
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
@ -499,8 +457,8 @@ OnError:
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;

View File

@ -0,0 +1,884 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
__BEGIN_DECLS
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOF16 \
CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toF16")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOI16 \
CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toI16")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOU8 \
CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toU8")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOI8 \
CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toI8")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_COPY_U8TOU8 \
CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_copy_U8toU8")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_COPY_U8TOI8 \
CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_copy_U8toI8")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_COPY_U8TOI16 \
CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_copy_U8toI16")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_COPY_U8TOF16 \
CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_copy_U8toF16")
// greater than a quarter
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOU8_GQ \
CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toU8_gq")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOI8_GQ \
CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toU8_gq")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOF16_GQ \
CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toF16_gq")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOI16_GQ \
CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toI16_gq")
#define KERNEL_SOURCE_1 "pre_process_nv12_rggb_copy",
#define KERNEL_SOURCE_2 "pre_process_nv12_rggb_scale",
typedef enum
{
COPY = 0,
SCALE,
TRANS
} vsi_nn_kernel_convert_type_e;
#define HASH_PRE_PROCESS_NV12_RGGB_KEY(_input0_type, _output_type, _convert_type, _greater_quarter) \
((_input0_type << 24) | (_output_type << 16) | (_convert_type << 8) | (_greater_quarter))
#define TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, SOURCE) \
{ HASH_PRE_PROCESS_NV12_RGGB_KEY(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, 0), \
VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_##CONVERT_TYPE##_##IN0_TYPE##TO##OUT_TYPE, \
SOURCE },
#define TENSOR_PRE_PROCESS_NV12_RGGB_GQ_KERNELS(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, SOURCE) \
{ HASH_PRE_PROCESS_NV12_RGGB_KEY(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, 1), \
VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_##CONVERT_TYPE##_##IN0_TYPE##TO##OUT_TYPE##_GQ, \
SOURCE },
static const struct {
uint32_t key;
char* function_name;
const char* source_name;
} pre_process_nv12_rggb_map[] =
{
TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, U8, COPY, KERNEL_SOURCE_1)
TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, I8, COPY, KERNEL_SOURCE_1)
TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, I16, COPY, KERNEL_SOURCE_1)
TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, F16, COPY, KERNEL_SOURCE_1)
TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_2)
TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_2)
TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_2)
TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_2)
TENSOR_PRE_PROCESS_NV12_RGGB_GQ_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_2)
TENSOR_PRE_PROCESS_NV12_RGGB_GQ_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_2)
TENSOR_PRE_PROCESS_NV12_RGGB_GQ_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_2)
TENSOR_PRE_PROCESS_NV12_RGGB_GQ_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_2)
};
static vx_param_description_t vxPreProcessNv12_RGGBKernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _EVIS_PRE_PROCESS_NV12_RGGB_PARAM_NUM _cnt_of_array(vxPreProcessNv12_RGGBKernel_param_def)
static vsi_bool _check_nv12_type_from_env()
{
vsi_bool ret = FALSE;
char* env_s = vsi_nn_getenv("VSI_NN_ENABLE_OCV_NV12");
if (env_s)
{
ret = TRUE;
}
return ret;
}
DEF_KERNEL_INITIALIZER(_pre_process_nv12_rggb_copy_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t shaderParam = {
3, // workdim
{0, 0, 0}, // globalWorkOffset: controls the start location to be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels are processed by a single thread
{0, 0, 0}, // localWorkSize: local work-group size in threads
{0, 0, 0}}; // globalWorkSize: image size in threads
float output_zp = 0;
float output_scale = 1;
int32_t reorder = 0;
int32_t order1 = 3;
uint32_t width = 0;
uint32_t height = 0;
int32_t nv_type = 0;
float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f;
float b_scale = 0.0f, g_scale = 0.0f, r_scale = 0.0f;
float outputScaleVar_b = 0.0f, outputScaleVar_g = 0.0f, outputScaleVar_r = 0.0f;
float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f;
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_size_array_t * out_shape = NULL;
vsi_bool ocv_nv12 = _check_nv12_type_from_env();
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[7], &rMean);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &gMean);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &bMean);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[10], &r_scale);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &reorder);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &nv_type);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[14], &g_scale);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[15], &b_scale);
CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[0]->shape;
output_scale = 1.0f / attr[0]->scale;
output_zp = (float)attr[0]->zero_point;
width = (uint32_t)(out_shape->data[0]);
height = (uint32_t)(out_shape->data[1]);
if (reorder != 0)
{
reorder = 3;
order1 = 0;
}
if (nv_type == VSI_NN_YUV_TYPE_NV21_BGGR)
{
int32_t tmporder = reorder;
reorder = order1;
order1 = tmporder;
}
outputScaleVar_b = output_scale * b_scale;
outputScaleVar_g = output_scale * g_scale;
outputScaleVar_r = output_scale * r_scale;
bMeanScaleVarZp = output_zp - bMean * outputScaleVar_b;
gMeanScaleVarZp = output_zp - gMean * outputScaleVar_g;
rMeanScaleVarZp = output_zp - rMean * outputScaleVar_r;
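    /* Illustrative derivation (editorial note, not part of the original change):
     * per channel c the folded terms above give
     *     out = outputScaleVar_c * value + cMeanScaleVarZp
     *         = output_scale * c_scale * (value - cMean) + output_zp,
     * i.e. mean subtraction, the per-channel scale and the output quantization
     * are fused into a single multiply-add in the shader. */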
shaderParam.global_scale[0] = 4;
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1)
/ shaderParam.global_scale[0], 4);
shaderParam.global_size[1] = gpu_align_p2((height + shaderParam.global_scale[1] - 1)
/ shaderParam.global_scale[1], 2);
shaderParam.global_size[2] = 1;
status = vsi_nn_kernel_gpu_config( node, &shaderParam );
CHECK_STATUS_FAIL_GOTO(status, OnError);
{
gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertNV12toB_4x4 = {{
0x05050505, // TCfg
0x04040404, // ASelt
0x00210000, 0x00630042, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x3f1d3c00, 0x00000000, 0x3f1d3c00, 0x00000000,
0x3f1d3c00, 0x00000000, 0x3f1d3c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertNV12toG_4x4 = {{
0x29292929, // TCfg
0x14141414, // ASelt
0x03210100, 0x07630542, // ABin
0x2a2a2a2a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc,
0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertNV12toR_4x4 = {{
0x05050505, // TCfg
0x04040404, // ASelt
0x00310010, 0x00730052, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000,
0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractYtoShortSub16_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractUVtoCharSub128_2x8 = {{
0x99999999, // TCfg
0x44444444, // ASelt
0x01000100, 0x03020302, // ABin
0xaaaaaaaa, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00010001, 0x00010001, 0x00010001, 0x00010001,
0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertUchartoFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000,
0x00030002, // ABin
0x02020202, // BSelt
0x00000000,
0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
if (ocv_nv12)
{
uniConvertNV12toB_4x4.data[2] = 0x00010000;
uniConvertNV12toB_4x4.data[3] = 0x00230022;
uniConvertNV12toB_4x4.data[8] = 0x40093ca7;
uniConvertNV12toB_4x4.data[10] = 0x40093ca7;
uniConvertNV12toB_4x4.data[12] = 0x40093ca7;
uniConvertNV12toB_4x4.data[14] = 0x40093ca7;
uniConvertNV12toG_4x4.data[2] = 0x01010100;
uniConvertNV12toG_4x4.data[3] = 0x03230322;
uniConvertNV12toG_4x4.data[8] = 0x36413ca7;
uniConvertNV12toG_4x4.data[9] = 0x00003a81;
uniConvertNV12toG_4x4.data[10] = 0x36413ca7;
uniConvertNV12toG_4x4.data[11] = 0x00003a81;
uniConvertNV12toG_4x4.data[12] = 0x36413ca7;
uniConvertNV12toG_4x4.data[13] = 0x00003a81;
uniConvertNV12toG_4x4.data[14] = 0x36413ca7;
uniConvertNV12toG_4x4.data[15] = 0x00003a81;
uniConvertNV12toR_4x4.data[2] = 0x00110010;
uniConvertNV12toR_4x4.data[3] = 0x00330032;
uniConvertNV12toR_4x4.data[8] = 0x3e623ca7;
uniConvertNV12toR_4x4.data[10] = 0x3e623ca7;
uniConvertNV12toR_4x4.data[12] = 0x3e623ca7;
uniConvertNV12toR_4x4.data[14] = 0x3e623ca7;
uniExtractUVtoCharSub128_2x8.data[2] = 0x03020100;
uniExtractUVtoCharSub128_2x8.data[3] = 0x07060504;
uniExtractYtoShortSub16_2x8.data[0] = 0x99999999;
uniExtractYtoShortSub16_2x8.data[1] = 0x44444444;
uniExtractYtoShortSub16_2x8.data[4] = 0xaaaaaaaa;
uniExtractYtoShortSub16_2x8.data[8] = 0x00010001;
uniExtractYtoShortSub16_2x8.data[9] = 0x00010001;
uniExtractYtoShortSub16_2x8.data[10] = 0x00010001;
uniExtractYtoShortSub16_2x8.data[11] = 0x00010001;
uniExtractYtoShortSub16_2x8.data[12] = 0x00010001;
uniExtractYtoShortSub16_2x8.data[13] = 0x00010001;
uniExtractYtoShortSub16_2x8.data[14] = 0x00010001;
uniExtractYtoShortSub16_2x8.data[15] = 0x00010001;
}
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toB_4x4", &uniConvertNV12toB_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toG_4x4", &uniConvertNV12toG_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toR_4x4", &uniConvertNV12toR_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder);
status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractUVtoCharSub128_2x8", &uniExtractUVtoCharSub128_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractYtoShortSub16_2x8", &uniExtractYtoShortSub16_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_b", &outputScaleVar_b);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_g", &outputScaleVar_g);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_r", &outputScaleVar_r);
status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUchartoFp32_4x4", &uniConvertUchartoFp32_4x4);
CHECK_STATUS_FAIL_GOTO(status, OnError);
switch( attr[0]->dtype )
{
case U8:
case I8:
case I16:
{
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertInt32toUint8_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case F16:
{
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertHalftoFp16_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
default:
break;
}
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
OnError:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
return status;
} /* _pre_process_nv12_rggb_copy_initializer() */
DEF_KERNEL_INITIALIZER(_pre_process_nv12_rggb_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t shaderParam = {
3, // workdim
{0, 0, 0}, // globalWorkOffset: controls the start location to be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels are processed by a single thread
{0, 0, 0}, // localWorkSize: local work-group size in threads
{0, 0, 0}}; // globalWorkSize: total image size in threads
float output_zp = 0;
float output_scale = 1;
int32_t reorder = 0;
int32_t order1 = 3;
uint32_t width = 0;
uint32_t height = 0;
uint32_t roi_width = 0;
uint32_t roi_height = 0;
uint32_t xrIntFloat_16 = 0;
uint32_t yrIntFloat_16 = 0;
int32_t xRatio = 0;
int32_t yRatio = 0;
int32_t nv_type = 0;
float bMean = 0.0f, gMean = 0.0f, rMean = 0.0f;
float b_scale = 0.0f, g_scale = 0.0f, r_scale = 0.0f;
float outputScaleVar_b = 0.0f, outputScaleVar_g = 0.0f, outputScaleVar_r = 0.0f;
float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f;
float resize = 0.0f;
vsi_bool ocv_nv12 = _check_nv12_type_from_env();
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
vsi_size_array_t * out_shape = NULL;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &xRatio);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &yRatio);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[7], &rMean);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &gMean);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &bMean);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[10], &r_scale);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &reorder);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &nv_type);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[14], &g_scale);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[15], &b_scale);
CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[1]->shape;
output_scale = 1.0f / attr[1]->scale;
output_zp = (float)attr[1]->zero_point;
width = (uint32_t)(out_shape->data[0]);
height = (uint32_t)(out_shape->data[1]);
if (reorder != 0)
{
reorder = 3;
order1 = 0;
}
if (nv_type == VSI_NN_YUV_TYPE_NV21_BGGR)
{
int32_t tmporder = reorder;
reorder = order1;
order1 = tmporder;
}
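/* xRatio / yRatio are Q15 fixed-point ratios of ROI extent to output extent;
   shifting right by 15 recovers the ROI size, "resize" is the output-to-ROI
   width ratio, and xrIntFloat_16 / yrIntFloat_16 are Q16.16 per-pixel source
   steps consumed by the shader. */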
roi_width = (xRatio * width) >> 15;
roi_height = (yRatio * height) >> 15;
resize = (float)width / roi_width;
xrIntFloat_16 = (uint32_t)((roi_width << 16) / width + 1);
yrIntFloat_16 = (uint32_t)((roi_height << 16) / height + 1);
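/* Fold per-channel normalization (mean / scale) and output quantization into
   a single affine transform: q = pixel * outputScaleVar_c + cMeanScaleVarZp
   for each channel c. */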
outputScaleVar_b = output_scale * b_scale;
outputScaleVar_g = output_scale * g_scale;
outputScaleVar_r = output_scale * r_scale;
bMeanScaleVarZp = output_zp - bMean * outputScaleVar_b;
gMeanScaleVarZp = output_zp - gMean * outputScaleVar_g;
rMeanScaleVarZp = output_zp - rMean * outputScaleVar_r;
shaderParam.global_scale[0] = 4;
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1)
/ shaderParam.global_scale[0], 4);
shaderParam.global_size[1] = gpu_align_p2((height + shaderParam.global_scale[1] - 1)
/ shaderParam.global_scale[1], 2);
shaderParam.global_size[2] = 1;
status = vsi_nn_kernel_gpu_config( node, &shaderParam );
CHECK_STATUS_FAIL_GOTO(status, OnError);
{
gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertNV12toB_4x4 = {{
0x05050505, // TCfg
0x04040404, // ASelt
0x00210000, 0x00630042, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x3f1d3c00, 0x00000000, 0x3f1d3c00, 0x00000000,
0x3f1d3c00, 0x00000000, 0x3f1d3c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertNV12toG_4x4 = {{
0x29292929, // TCfg
0x14141414, // ASelt
0x03210100, 0x07630542, // ABin
0x2a2a2a2a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc,
0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertNV12toR_4x4 = {{
0x05050505, // TCfg
0x04040404, // ASelt
0x00310010, 0x00730052, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000,
0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertYtoShortSub16_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertUVtoCharSub128_2x8 = {{
0x99999999, // TCfg
0x44444444, // ASelt
0x03020100, 0x07060504, // ABin
0xaaaaaaaa, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00010001, 0x00010001, 0x00010001, 0x00010001,
0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
}, GPU_DP_TYPE_16 };
//trans
gpu_dp_inst_t uniCalculateYShift_2x8 = {{
0x00009999, // TCfg
0x00000000, // ASelt
0x06040200, 0x00000000, // ABin
0x00005555, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniCalculateUVShift_2x8 = {{
0x51515151, // TCfg
0x40404040, // ASelt
0x02020000, 0x06060404, // ABin
0x91919191, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00010000, 0x00000000, 0x00010000,
0x00000000, 0x00010000, 0x00000000, 0x00010000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertUchartoFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000,
0x00030002, // ABin
0x02020202, // BSelt
0x00000000,
0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
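/* Same OpenCV-compatible coefficient patch as in the copy initializer above. */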
if (ocv_nv12)
{
uniConvertNV12toB_4x4.data[2] = 0x00010000;
uniConvertNV12toB_4x4.data[3] = 0x00230022;
uniConvertNV12toB_4x4.data[8] = 0x40093ca7;
uniConvertNV12toB_4x4.data[10] = 0x40093ca7;
uniConvertNV12toB_4x4.data[12] = 0x40093ca7;
uniConvertNV12toB_4x4.data[14] = 0x40093ca7;
uniConvertNV12toG_4x4.data[2] = 0x01010100;
uniConvertNV12toG_4x4.data[3] = 0x03230322;
uniConvertNV12toG_4x4.data[8] = 0x36413ca7;
uniConvertNV12toG_4x4.data[9] = 0x00003a81;
uniConvertNV12toG_4x4.data[10] = 0x36413ca7;
uniConvertNV12toG_4x4.data[11] = 0x00003a81;
uniConvertNV12toG_4x4.data[12] = 0x36413ca7;
uniConvertNV12toG_4x4.data[13] = 0x00003a81;
uniConvertNV12toG_4x4.data[14] = 0x36413ca7;
uniConvertNV12toG_4x4.data[15] = 0x00003a81;
uniConvertNV12toR_4x4.data[2] = 0x00110010;
uniConvertNV12toR_4x4.data[3] = 0x00330032;
uniConvertNV12toR_4x4.data[8] = 0x3e623ca7;
uniConvertNV12toR_4x4.data[10] = 0x3e623ca7;
uniConvertNV12toR_4x4.data[12] = 0x3e623ca7;
uniConvertNV12toR_4x4.data[14] = 0x3e623ca7;
uniConvertYtoShortSub16_2x8.data[0] = 0x99999999;
uniConvertYtoShortSub16_2x8.data[1] = 0x44444444;
uniConvertYtoShortSub16_2x8.data[4] = 0xaaaaaaaa;
uniConvertYtoShortSub16_2x8.data[8] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[9] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[10] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[11] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[12] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[13] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[14] = 0x00010001;
uniConvertYtoShortSub16_2x8.data[15] = 0x00010001;
}
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toB_4x4", &uniConvertNV12toB_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toG_4x4", &uniConvertNV12toG_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toR_4x4", &uniConvertNV12toR_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUVtoCharSub128_2x8", &uniConvertUVtoCharSub128_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertYtoShortSub16_2x8", &uniConvertYtoShortSub16_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "xrIntFloat_16", &xrIntFloat_16);
status |= vsi_nn_kernel_gpu_add_param(node, "yrIntFloat_16", &yrIntFloat_16);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_b", &outputScaleVar_b);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_g", &outputScaleVar_g);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_r", &outputScaleVar_r);
status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp);
if (resize >= 0.25)
{
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateYShift_2x8", &uniCalculateYShift_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateUVShift_2x8", &uniCalculateUVShift_2x8);
}
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUchartoFp32_4x4", &uniConvertUchartoFp32_4x4);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder);
status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1);
CHECK_STATUS_FAIL_GOTO(status, OnError );
switch( attr[1]->dtype )
{
case U8:
case I8:
case I16:
{
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertInt32toUint8_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case F16:
{
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertHalftoFp16_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
default:
break;
}
}
OnError:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;
} /* _pre_process_nv12_rggb_initializer() */
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel,
const vsi_nn_kernel_param_t * params,
int32_t scale_x
)
{
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8;
vsi_nn_kernel_convert_type_e convert_type = SCALE;
vsi_status status = VSI_FAILURE;
uint32_t key = 0;
size_t i = 0;
vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" );
vsi_size_t dstWidth = outputs[0]->attr.size[0];
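/* scaleVal is the output-to-ROI width ratio derived from the Q15 scale_x,
   mirroring the "resize" value computed in the initializer; it decides
   whether the shift-optimized kernel variant can be selected. */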
float scaleVal = (float)dstWidth / ((scale_x * dstWidth) >> 15);
uint32_t optFlg = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (enable_copy)
{
convert_type = COPY;
}
else
{
convert_type = SCALE;
}
if (scaleVal >= 0.25 && convert_type == SCALE)
{
optFlg = 1;
}
key = HASH_PRE_PROCESS_NV12_RGGB_KEY( input0_dtype, output_dtype, convert_type, optFlg );
for ( i = 0; i < _cnt_of_array(pre_process_nv12_rggb_map); i ++ )
{
if ( pre_process_nv12_rggb_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(pre_process_nv12_rggb_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_nv12_rggb_map[i].function_name );
kernel->info.parameters = vxPreProcessNv12_RGGBKernel_param_def;
kernel->info.numParams = _cnt_of_array( vxPreProcessNv12_RGGBKernel_param_def );
if (convert_type == COPY)
{
kernel->info.initialize = _pre_process_nv12_rggb_copy_initializer;
}
else
{
kernel->info.initialize = _pre_process_nv12_rggb_initializer;
}
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
pre_process_nv12_rggb_map[i].source_name );
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
pre_process_nv12_rggb_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_NV12_RGGB_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
int32_t trans = 0;
int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" );
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( inputs, outputs, kernel, params, scale_x );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
uint32_t index = 3;
int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" );
int32_t left = vsi_nn_kernel_param_get_int32( params, "left" );
int32_t top = vsi_nn_kernel_param_get_int32( params, "top" );
float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" );
float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" );
float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" );
float r_scale = vsi_nn_kernel_param_get_float32( params, "r_scale" );
float g_scale = vsi_nn_kernel_param_get_float32( params, "g_scale" );
float b_scale = vsi_nn_kernel_param_get_float32( params, "b_scale" );
int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" );
int32_t nv_type = vsi_nn_kernel_param_get_int32( params, "nv_type" );
/* Pass parameters to node. */
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_NV12_RGGB_PARAM_NUM,
inputs, 2, outputs, 1 );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_scale );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &nv_type );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_scale );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_scale );
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PRE_PROCESS_NV12_RGGB_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &tmp_params[3] );
vsi_nn_kernel_scalar_release( &tmp_params[4] );
vsi_nn_kernel_scalar_release( &tmp_params[5] );
vsi_nn_kernel_scalar_release( &tmp_params[6] );
vsi_nn_kernel_scalar_release( &tmp_params[7] );
vsi_nn_kernel_scalar_release( &tmp_params[8] );
vsi_nn_kernel_scalar_release( &tmp_params[9] );
vsi_nn_kernel_scalar_release( &tmp_params[10] );
vsi_nn_kernel_scalar_release( &tmp_params[11] );
vsi_nn_kernel_scalar_release( &tmp_params[12] );
vsi_nn_kernel_scalar_release( &tmp_params[13] );
vsi_nn_kernel_scalar_release( &tmp_params[14] );
vsi_nn_kernel_scalar_release( &tmp_params[15] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( pre_process_nv12_rggb, _setup )

View File

@ -403,23 +403,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_copy_initializer)
out_shape = attr[0]->shape;
width = (uint32_t)(out_shape->data[0]);
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if ( attr[0]->dfp.fl > 0 )
{
output_scale *= (float)((int64_t)1 << attr[0]->dfp.fl);
}
else
{
output_scale *= (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl));
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
output_zp = (float)attr[0]->asymm.zero_point;
output_scale /= attr[0]->asymm.scale;
}
output_zp = (float)attr[0]->zero_point;
output_scale = 1.0f / attr[0]->scale;
shaderParam.global_scale[0] = 16;
shaderParam.global_scale[1] = 1;
@ -620,8 +605,8 @@ OnError:
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;

View File

@ -463,22 +463,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_copy_initializer)
width = (uint32_t)(out_shape->data[0] / 3);
height = (uint32_t)(out_shape->data[1]);
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if ( attr[0]->dfp.fl > 0 )
{
output_scale *= (float)((int64_t)1 << attr[0]->dfp.fl);
}
else
{
output_scale *= (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl));
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
output_zp = (float)attr[0]->asymm.zero_point;
output_scale /= attr[0]->asymm.scale;
}
output_zp = (float)attr[0]->zero_point;
output_scale = 1.0f / attr[0]->scale;
if (attr[0]->dtype == F16 || attr[0]->dtype == I16)
{
@ -787,8 +773,8 @@ OnError:
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;

View File

@ -179,28 +179,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
}
enable_copy = (int32_t)(xRatio == (1 << 15) && yRatio == (1 << 15));
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[0]->dfp.fl > 0)
{
outputScale = (float)((int64_t)1 << attr[0]->dfp.fl);
}
else
{
outputScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl));
}
outputZP = 0;
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
outputScale = 1.0f / attr[0]->asymm.scale;
outputZP = (float)attr[0]->asymm.zero_point;
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
outputScale = 1;
outputZP = 0;
}
outputScale = 1.0f / attr[0]->scale;
outputZP = (float)attr[0]->zero_point;
#define _PACK_SELECT_KEY( COPY_FLAG, REVERSE_FLAG, TRANS_FLAG) \
(COPY_FLAG | (REVERSE_FLAG << 24) | (TRANS_FLAG << 16) )

View File

@ -143,23 +143,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer)
order1 = 0;
}
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
dstScale = 1.0f / attr[0]->asymm.scale;
dstZP = attr[0]->asymm.zero_point;
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[0]->dfp.fl > 0)
{
dstScale = (float)((int64_t)1 << attr[0]->dfp.fl);
}
else
{
dstScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl));
}
dstZP = 0;
}
dstScale = 1.0f / attr[0]->scale;
dstZP = attr[0]->zero_point;
shaderParam.global_scale[0] = 16;
shaderParam.global_scale[1] = 1;
@ -501,8 +486,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[0]->shape;
dstZP = attr[0]->asymm.zero_point;
dstScale = attr[0]->asymm.scale;
dstZP = attr[0]->zero_point;
dstScale = 1.0f / attr[0]->scale;
width = (uint32_t)(out_shape->data[0]);
height = (uint32_t)(out_shape->data[1]);
@ -512,28 +497,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer)
order1 = 0;
}
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[0]->dfp.fl > 0)
{
dstScale = (vx_float32)((int64_t)1 << attr[0]->dfp.fl);
}
else
{
dstScale = (1.0f / (vx_float32)((int64_t)1 << -attr[0]->dfp.fl));
}
dstZP = 0;
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
dstScale = 1.0f / dstScale;
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
dstScale = 1;
dstZP = 0;
}
shaderParam.global_scale[0] = 4;
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;

View File

@ -164,46 +164,24 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );
out_shape = attr[2]->shape;
out_shape = attr[2]->shape;
inputZP0 = attr[0]->zero_point;
input_scale0 = attr[0]->scale;
inputZP1 = attr[1]->zero_point;
input_scale1 = attr[1]->scale;
outputZP = (float)attr[2]->zero_point;
input_scale0 = input_scale0 / attr[2]->scale;
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
in0_fl = (int8_t)attr[0]->dfp.fl;
if (in0_fl >= 0)
{
input_scale0 = 1.0f / (vx_float32) ((int64_t)1 << in0_fl);
}
else if (in0_fl < 0)
{
input_scale0 = (vx_float32) ((int64_t)1 << -in0_fl);
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
inputZP0 = attr[0]->asymm.zero_point;
input_scale0 = attr[0]->asymm.scale;
}
if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
inputZP1 = attr[1]->asymm.zero_point;
input_scale1 = attr[1]->asymm.scale;
}
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
out_fl = (int8_t)attr[2]->dfp.fl;
}
if (out_fl >= 0)
input_scale0 *= (vx_float32)((int64_t)1 << out_fl);
else if (out_fl < 0)
input_scale0 *= 1.0f / (vx_float32) ((int64_t)1 << -out_fl);
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
out_fl = 1;
outputZP = (float)attr[2]->asymm.zero_point;
input_scale0 = input_scale0 / attr[2]->asymm.scale;
}
shift0 = in0_fl - out_fl;
is_2d_img = (out_shape->size < 3) || (out_shape->data[2] == 1);

View File

@ -152,7 +152,6 @@ DEF_KERNEL_INITIALIZER(_reducemax_internal_initializer)
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t * input_shape = NULL;
vsi_size_array_t * output_shape = NULL;
int32_t input_fl = 0, output_fl = 0;
int32_t axisSize = 0;
float inputScale = 1.0f;
float input_offset_asymmetric = 0.0f;
@ -257,68 +256,19 @@ DEF_KERNEL_INITIALIZER(_reducemax_internal_initializer)
}
}
if( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
input_fl = input_attr->dfp.fl;
if (input_fl > 0)
{
inputScale = 1.0f / (float) ((int64_t)1 << input_fl);
}
else
{
inputScale = (float)((int64_t)1 << -input_fl);
}
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
inputScale = input_attr->asymm.scale;
input_offset_asymmetric = (float)(input_attr->asymm.zero_point);
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
{
inputScale = 1.0f;
input_offset_asymmetric = 0;
inputScale = input_attr->scale;
input_offset_asymmetric = (float)(input_attr->zero_point);
outputScale = 1.0f / output_attr->scale;
output_offset_asymmetric = (float)(output_attr->zero_point);
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
output_fl = output_attr->dfp.fl;
if (output_fl > 0)
{
outputScale = (float) ((int64_t)1 << output_fl);
}
else
{
outputScale = 1.0f / (float)((int64_t)1 << -output_fl);
}
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
outputScale = 1.0f / output_attr->asymm.scale;
output_offset_asymmetric = (float)(output_attr->asymm.zero_point);
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
{
outputScale = 1.0f;
output_offset_asymmetric = 0;
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
status = vsi_nn_kernel_gpu_add_param( node, "axisSize", &axisSize );
CHECK_STATUS_FAIL_GOTO(status, final );
status = vsi_nn_kernel_gpu_config( node, &gpu_param );

View File

@ -154,7 +154,6 @@ DEF_KERNEL_INITIALIZER(_reducemin_internal_initializer)
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t * input_shape = NULL;
vsi_size_array_t * output_shape = NULL;
int32_t input_fl = 0, output_fl = 0;
int32_t axisSize = 0;
float inputScale = 1.0f;
float input_offset_asymmetric = 0.0f;
@ -259,68 +258,18 @@ DEF_KERNEL_INITIALIZER(_reducemin_internal_initializer)
}
}
if( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
input_fl = input_attr->dfp.fl;
if (input_fl > 0)
{
inputScale = 1.0f / (float) ((int64_t)1 << input_fl);
}
else
{
inputScale = (float)((int64_t)1 << -input_fl);
}
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
inputScale = input_attr->asymm.scale;
input_offset_asymmetric = (float)(input_attr->asymm.zero_point);
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
{
inputScale = 1.0f;
input_offset_asymmetric = 0;
inputScale = input_attr->scale;
input_offset_asymmetric = (float)(input_attr->zero_point);
outputScale = 1.0f / output_attr->scale;
output_offset_asymmetric = (float)(output_attr->zero_point);
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
output_fl = output_attr->dfp.fl;
if (output_fl > 0)
{
outputScale = (float) ((int64_t)1 << output_fl);
}
else
{
outputScale = 1.0f / (float)((int64_t)1 << -output_fl);
}
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
outputScale = 1.0f / output_attr->asymm.scale;
output_offset_asymmetric = (float)(output_attr->asymm.zero_point);
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
{
outputScale = 1.0f;
output_offset_asymmetric = 0;
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
status = vsi_nn_kernel_gpu_add_param( node, "axisSize", &axisSize );
CHECK_STATUS_FAIL_GOTO(status, final );

View File

@ -160,7 +160,6 @@ DEF_KERNEL_INITIALIZER(_reduceprod_internal_initializer)
vsi_size_array_t * output_shape = NULL;
vsi_nn_kernel_dtype_e src_dtype = F16;
vsi_nn_kernel_dtype_e dst_dtype = F16;
int32_t input_fl = 0, output_fl = 0;
int32_t axisSize = 0;
float inputScale = 1.0f;
float input_offset_asymmetric = 0.0f;
@ -348,68 +347,17 @@ DEF_KERNEL_INITIALIZER(_reduceprod_internal_initializer)
CHECK_STATUS_FAIL_GOTO(status, final );
}
if( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
input_fl = input_attr->dfp.fl;
if (input_fl > 0)
{
inputScale = 1.0f / (float) ((int64_t)1 << input_fl);
}
else
{
inputScale = (float)((int64_t)1 << -input_fl);
}
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
inputScale = input_attr->asymm.scale;
input_offset_asymmetric = (float)(input_attr->asymm.zero_point);
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
{
inputScale = 1.0f;
input_offset_asymmetric = 0;
inputScale = input_attr->scale;
input_offset_asymmetric = (float)(input_attr->zero_point);
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
output_fl = output_attr->dfp.fl;
if (output_fl > 0)
{
outputScale = (float) ((int64_t)1 << output_fl);
}
else
{
outputScale = 1.0f / (float)((int64_t)1 << -output_fl);
}
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
outputScale = 1.0f / output_attr->asymm.scale;
output_offset_asymmetric = (float)(output_attr->asymm.zero_point);
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
{
outputScale = 1.0f;
output_offset_asymmetric = 0;
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
}
outputScale = 1.0f / output_attr->scale;
output_offset_asymmetric = (float)(output_attr->zero_point);
status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale );
status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric );
CHECK_STATUS_FAIL_GOTO(status, final );
status = vsi_nn_kernel_gpu_config( node, &gpu_param );

View File

@ -138,8 +138,6 @@ DEF_KERNEL_INITIALIZER(_relu_keras_initializer)
float inputTail = 0.0f;
float output_ZP = 0;
float input_ZP = 0;
int32_t srcFixPointPos = 0;
int32_t dstFixPointPos = 0;
VSI_UNREFERENCED(param_size);
@ -154,25 +152,10 @@ DEF_KERNEL_INITIALIZER(_relu_keras_initializer)
output_dtype = output_attr->dtype;
offset = alpha * threshold;
if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
srcFixPointPos = input_attr->dfp.fl;
}
else if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant)
{
input_ZP = (float)(input_attr->asymm.zero_point);
scaleIn = input_attr->asymm.scale;
}
if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant)
{
dstFixPointPos = output_attr->dfp.fl;
}
else if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant)
{
output_ZP = (float)(output_attr->asymm.zero_point);
scaleOut = 1.0f / output_attr->asymm.scale;
}
input_ZP = (float)(input_attr->zero_point);
scaleIn = input_attr->scale;
output_ZP = (float)(output_attr->zero_point);
scaleOut = 1.0f / output_attr->scale;
gpu_param.global_scale[0] = 8;
gpu_param.global_scale[1] = 1;
@ -195,11 +178,6 @@ DEF_KERNEL_INITIALIZER(_relu_keras_initializer)
}
else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
if (srcFixPointPos >=0 )
scaleIn = 1.0f / (float) ((int64_t)1 << srcFixPointPos);
else
scaleIn = (float) ((int64_t)1 << -srcFixPointPos);
status = vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn);
CHECK_STATUS_FAIL_GOTO(status, final );
}
@ -212,11 +190,6 @@ DEF_KERNEL_INITIALIZER(_relu_keras_initializer)
}
else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant)
{
if (dstFixPointPos >=0 )
scaleOut = (float) ((int64_t)1 << dstFixPointPos);
else
scaleOut = 1.0f / (float) ((int64_t)1 << -dstFixPointPos);
status = vsi_nn_kernel_gpu_add_param(node, "output_scale", &scaleOut);
CHECK_STATUS_FAIL_GOTO(status, final );
}

View File

@ -197,8 +197,6 @@ DEF_KERNEL_INITIALIZER(_resize_1d_bilinear_initializer)
int32_t half_pixel_centers = 0;
uint32_t depth = 0;
int32_t srcFixPointPos = 0;
int32_t dstFixPointPos = 0;
float input_scale = 1.0;
int32_t inputZP = 0;
float output_scale = 1.0;
@ -259,53 +257,10 @@ DEF_KERNEL_INITIALIZER(_resize_1d_bilinear_initializer)
half_pixel_value = 0.0f;
}
if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant )
{
input_scale = input_attr->asymm.scale;
inputZP = input_attr->asymm.zero_point;
}
else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
srcFixPointPos = input_attr->dfp.fl;
if (srcFixPointPos >= 0)
{
input_scale = 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos);
}
else if (srcFixPointPos < 0)
{
input_scale = (vx_float32)((int64_t)1 << -srcFixPointPos);
}
inputZP = 0;
}
else
{
input_scale = 1.0f;
inputZP = 0;
}
if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant )
{
output_scale = output_attr->asymm.scale;
outputZP = output_attr->asymm.zero_point;
}
else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant)
{
dstFixPointPos = output_attr->dfp.fl;
if (dstFixPointPos >= 0)
{
output_scale = (vx_float32) ((int64_t)1 << dstFixPointPos);
}
else if (dstFixPointPos < 0)
{
output_scale = 1.0f / (vx_float32) ((int64_t)1 << -dstFixPointPos);
}
outputZP = 0;
}
else
{
output_scale = 1.0;
outputZP = 0;
}
input_scale = input_attr->scale;
inputZP = input_attr->zero_point;
output_scale = output_attr->scale;
outputZP = output_attr->zero_point;
if (is_run_nx_kernel)
{
@ -473,7 +428,7 @@ DEF_KERNEL_INITIALIZER(_resize_1d_bilinear_initializer)
}
else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
float dfpScale = input_scale * output_scale;
float dfpScale = input_scale / output_scale;
gpu_dp_inst_t uniConvertDFP2FP32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt

View File

@ -198,52 +198,19 @@ DEF_KERNEL_INITIALIZER(_resize_1d_nearest_initializer)
half_pixel_value = 0.0f;
}
if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant )
{
input_scale = input_attr->asymm.scale;
inputZP = input_attr->asymm.zero_point;
}
else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
input_scale = input_attr->scale;
inputZP = input_attr->zero_point;
output_scale = 1.0f / output_attr->scale;
outputZP = output_attr->zero_point;
if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
srcFixPointPos = input_attr->dfp.fl;
if (srcFixPointPos >= 0)
{
input_scale = 1.0f / (float) ((int64_t)1 << srcFixPointPos);
}
else if (srcFixPointPos < 0)
{
input_scale = (float)((int64_t)1 << -srcFixPointPos);
}
inputZP = 0;
}
else
{
input_scale = 1.0f;
inputZP = 0;
}
if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant )
{
output_scale = 1.0f / output_attr->asymm.scale;
outputZP = output_attr->asymm.zero_point;
}
else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant)
if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant)
{
dstFixPointPos = output_attr->dfp.fl;
if (dstFixPointPos >= 0)
{
output_scale = (float) ((int64_t)1 << dstFixPointPos);
}
else if (dstFixPointPos < 0)
{
output_scale = 1.0f / (float) ((int64_t)1 << -dstFixPointPos);
}
outputZP = 0;
}
else
{
output_scale = 1.0;
outputZP = 0;
}
if (F16 == input_dtype && F16 == output_dtype)

View File

@ -122,12 +122,16 @@ static const _kernel_map_type _resize_bilinear_kernel_map[] =
PACK_KERNEL_MAP_DOWN(I16, I16),
PACK_KERNEL_MAP_DOWN(U8, F16),
PACK_KERNEL_MAP_DOWN(U8, U8),
PACK_KERNEL_MAP_DOWN(U16, F16),
PACK_KERNEL_MAP_DOWN(U16, U16),
PACK_KERNEL_MAP_DOWN(F16, F16),
PACK_KERNEL_MAP_DOWN(F16, U8),
PACK_KERNEL_MAP_DOWN(F16, U16),
PACK_KERNEL_MAP_DOWN(BF16, BF16),
PACK_KERNEL_MAP_UP(I8, I8),
PACK_KERNEL_MAP_UP(I16, I16),
PACK_KERNEL_MAP_UP(U8, U8),
PACK_KERNEL_MAP_UP(U16, U16),
PACK_KERNEL_MAP_UP(F16, F16),
PACK_KERNEL_MAP_UP(BF16, BF16),
PACK_KERNEL_MAP_UP_OPT(U8, U8),
@ -223,8 +227,6 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
int32_t half_pixel_centers;
uint32_t depth = 0;
int32_t srcFixPointPos = 0;
int32_t dstFixPointPos = 0;
float input_scale = 1.0;
int32_t inputZP = 0;
float output_scale = 1.0;
@ -285,201 +287,16 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
half_pixel_value = 0.0f;
}
if (U8 == input_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant )
{
input_scale = input_attr->asymm.scale;
inputZP = input_attr->asymm.zero_point;
}
else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
srcFixPointPos = input_attr->dfp.fl;
if (srcFixPointPos >= 0)
{
input_scale = 1.0f / (float) ((int64_t)1 << srcFixPointPos);
}
else if (srcFixPointPos < 0)
{
input_scale = (float)((int64_t)1 << -srcFixPointPos);
}
inputZP = 0;
}
else
{
input_scale = 1.0f;
inputZP = 0;
}
if (U8 == output_dtype && VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant )
{
output_scale = output_attr->asymm.scale;
outputZP = output_attr->asymm.zero_point;
}
else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant)
{
dstFixPointPos = output_attr->dfp.fl;
if (dstFixPointPos >= 0)
{
output_scale = (float) ((int64_t)1 << dstFixPointPos);
}
else if (dstFixPointPos < 0)
{
output_scale = 1.0f / (float) ((int64_t)1 << -dstFixPointPos);
}
outputZP = 0;
}
else
{
output_scale = 1.0;
outputZP = 0;
}
input_scale = input_attr->scale;
inputZP = input_attr->zero_point;
output_scale = output_attr->scale;
outputZP = output_attr->zero_point;
gpu_param.global_scale[0] = 4;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
float dfpScale = input_scale * output_scale;
gpu_dp_inst_t uniConvertDFP2FP32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000300, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtact8Bit_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniRightSubLeft_4x4 = {{
0x09090909, // TCfg
0x00000000, // ASelt
0x00230001, 0x00670045, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniDFPtoFp32_left_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00020000, 0x00060004, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
if (I8 == input_dtype && I8 == output_dtype && out_width > in_width)
{
gpu_dp_inst_t uniConvertI32toI16_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniGetMaskShift_2x8 = {{
0x99999999, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x55555555, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniConvertDFP2FP32_part1_4x4 = {{
0x09090909, // TCfg
0x00000000, // ASelt
0x00150004, 0x00370026, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000300, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniConvertI32toI16_2x8", &uniConvertI32toI16_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniGetMaskShift_2x8", &uniGetMaskShift_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniDFPtoFp32_left_4x4", &uniConvertDFP2FP32_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4",
&uniConvertDFP2FP32_part1_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth);
CHECK_STATUS_FAIL_GOTO(status, final );
gpu_param.global_scale[2] = depth;
}
else if (I16 == input_dtype && I16 == output_dtype && out_width > in_width)
{
gpu_dp_inst_t uniConvertI32toI16_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniGetMaskShift_2x8 = {{
0x99999999, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x55555555, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniConvertDFP2FP32_part1_4x4 = {{
0x09090909, // TCfg
0x00000000, // ASelt
0x00150004, 0x00370026, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000300, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniConvertI32toI16_2x8", &uniConvertI32toI16_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniGetMaskShift_2x8", &uniGetMaskShift_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniDFPtoFp32_left_4x4", &uniConvertDFP2FP32_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4",
&uniConvertDFP2FP32_part1_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth);
CHECK_STATUS_FAIL_GOTO(status, final );
gpu_param.global_scale[2] = depth;
}
else
{
status = vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniRightSubLeft_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniDFPtoFp32_left_4x4", &uniDFPtoFp32_left_4x4);
CHECK_STATUS_FAIL_GOTO(status, final );
}
status = vsi_nn_kernel_gpu_add_param( node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "scale_xy", scale_factor);
status |= vsi_nn_kernel_gpu_add_param( node, "dfpScale", &dfpScale);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (U8 == input_dtype && (U8 == output_dtype || F16 == output_dtype))
if ((U8 == input_dtype || U16 == input_dtype || I8 == input_dtype || I16 == input_dtype))
{
float uint8Scale = input_scale / output_scale;
float uint8ZP_out = (float)outputZP;
@ -615,7 +432,7 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
}
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (F16 == input_dtype && (U8 == output_dtype || F16 == output_dtype))
else if (F16 == input_dtype && (U8 == output_dtype || F16 == output_dtype || U16 == output_dtype))
{
float uint8Scale = 1.0f / output_scale;
float uint8ZP_out = (float)outputZP;

View File

@ -0,0 +1,453 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
#define _RESIZE_CUBIC_KERNEL_SOURCE() "resize_cubic"
#define STR(a) #a
// Add kernel hashtable here
#define RESIZE_CUBIC_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
(( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) )
#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_CUBIC_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \
CVIVANTE_NAMESPACE("evis.resize_cubic_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
_RESIZE_CUBIC_KERNEL_SOURCE() }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _resize_cubic_kernel_map[] =
{
PACK_KERNEL_MAP(F16, F16),
PACK_KERNEL_MAP(I16, I16),
PACK_KERNEL_MAP(F16, I16),
PACK_KERNEL_MAP(I16, F16),
PACK_KERNEL_MAP(I8, I8),
PACK_KERNEL_MAP(F16, I8),
PACK_KERNEL_MAP(I8, F16),
PACK_KERNEL_MAP(U8, U8),
PACK_KERNEL_MAP(F16, U8),
PACK_KERNEL_MAP(U8, F16),
};
/*
* Kernel params
*/
static vx_param_description_t _resize_cubic_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
};
#define RESIZE_CUBIC_NUM _cnt_of_array( _resize_cubic_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_resize_cubic_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t *input_attr = NULL;
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t * out_shape = NULL;
float input_scale = 1.0;
float input_tail = 0;
float output_scale = 1.0;
float output_tail = 0;
VSI_UNREFERENCED(param_size);
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0]);
CHECK_PTR_FAIL_GOTO( input_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1]);
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
out_shape = output_attr->shape;
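/* Fold input dequantization and output requantization into scale / tail
   pairs: real = q_in * input_scale + input_tail and
   q_out = real * output_scale + output_tail. */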
if ( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = input_attr->dfp.fl;
if (fl > 0)
{
input_scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
input_scale = (float)((int64_t)1 << -fl);
}
}
else if ( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
input_scale = input_attr->asymm.scale;
input_tail = 0 - input_scale * (float)input_attr->asymm.zero_point;
}
if ( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = output_attr->dfp.fl;
if (fl > 0)
{
output_scale = (float) ((int64_t)1 << fl);
}
else
{
output_scale = 1.0f / (float)((int64_t)1 << -fl);
}
}
else if ( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
output_scale = 1.0f / output_attr->asymm.scale;
output_tail = (float)output_attr->asymm.zero_point;
}
gpu_param.global_scale[0] = 4;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
{
gpu_dp_inst_t uniFp16ToFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtract8Bit_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtractHalf8_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniFp16ToFp32_4x4", &uniFp16ToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniExtractHalf8_2x8", &uniExtractHalf8_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniExtract8Bit_2x8", &uniExtract8Bit_2x8);
}
status |= vsi_nn_kernel_gpu_add_param( node, "input_scale", &input_scale);
status |= vsi_nn_kernel_gpu_add_param( node, "input_tail", &input_tail);
status |= vsi_nn_kernel_gpu_add_param( node, "output_scale", &output_scale);
status |= vsi_nn_kernel_gpu_add_param( node, "output_tail", &output_tail);
gpu_param.global_size[0] = gpu_align_p2(
(out_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (
(out_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1]);
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr );
if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr );
return status;
} /* _resize_cubic_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _resize_cubic_kernel_map;
size_t kernel_map_size = _cnt_of_array( _resize_cubic_kernel_map );
vx_param_description_t * param_def = _resize_cubic_kernel_param_def;
size_t param_def_size = RESIZE_CUBIC_NUM;
vx_kernel_initialize_f initializer = _resize_cubic_initializer;
uint32_t key = 0;
uint32_t i = 0;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = RESIZE_CUBIC_HASH_KEY( in_dtype, out_dtype );
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = (uint32_t)param_def_size;
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_tensor_t* _create_scale_tensor
(
vsi_nn_graph_t *graph,
vsi_size_t output_size,
float scale_factor,
float half_pixel_value,
vsi_nn_tensor_t** index
)
{
vsi_nn_tensor_attr_t attr;
vsi_nn_tensor_t* scale = NULL;
vsi_size_t i = 0;
float *scale_data_ptr = NULL;
int *index_data_ptr = NULL;
float scale_value = 0;
vsi_ssize_t data = 0;
int idx = 0;
float delta_v = 0;
float cubic_coeff_a = -0.5f;
vsi_size_t item_count = 4 * output_size;
scale_data_ptr = (float *)malloc(item_count * sizeof(float));
if (scale_data_ptr == NULL)
{
VSILOGE("allocate memory fail at function %s line %d", __FUNCTION__, __LINE__);
goto OnError;
}
index_data_ptr = (int *)malloc(output_size * sizeof(int));
if (index_data_ptr == NULL)
{
VSILOGE("allocate memory fail at function %s line %d", __FUNCTION__, __LINE__);
goto OnError;
}
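/* For each output coordinate, split the source position into a truncated
   base index (idx) and a fractional offset (delta_v), then precompute the
   four cubic convolution weights (coefficient a = -0.5) for the 4-tap
   neighborhood starting at idx. */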
for (i = 0; i < output_size; i ++)
{
scale_value = ((float)i + half_pixel_value) * scale_factor - half_pixel_value;
data = (vsi_ssize_t)scale_value;
delta_v = scale_value - (float)data;
idx = (int)data - 1;
index_data_ptr[i] = idx;
scale_data_ptr[i * 4 + 0] = cubic_coeff_a * (((delta_v - 4) * (delta_v + 1) + 8) * (delta_v + 1) - 4);
scale_data_ptr[i * 4 + 1] = ((cubic_coeff_a + 2) * delta_v - (cubic_coeff_a + 3)) * delta_v *delta_v + 1;
scale_data_ptr[i * 4 + 2] = ((cubic_coeff_a + 2) * (1 - delta_v) - (cubic_coeff_a + 3))
* (1 - delta_v) * (1 - delta_v) + 1;
scale_data_ptr[i * 4 + 3] = cubic_coeff_a * ((( 2 - delta_v - 5) * (2 - delta_v) + 8) * (2 - delta_v) - 4);
}
memset( &attr, 0, sizeof( attr ) ); /* zero unused attribute fields before use */
attr.size[0] = item_count;
attr.dim_num = 1;
attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
attr.vtl = FALSE;
scale = vsi_nn_CreateTensorFromData(graph, (uint8_t *)scale_data_ptr, &attr);
if (scale_data_ptr)
{
free (scale_data_ptr);
scale_data_ptr = NULL;
}
attr.size[0] = output_size;
attr.dim_num = 1;
attr.dtype.vx_type = VSI_NN_TYPE_INT32;
attr.vtl = FALSE;
*index = vsi_nn_CreateTensorFromData(graph, (uint8_t *)index_data_ptr, &attr);
if (index_data_ptr)
{
free (index_data_ptr);
index_data_ptr = NULL;
}
OnError:
if (scale_data_ptr) free(scale_data_ptr); /* avoid leaking on early exit */
if (index_data_ptr) free(index_data_ptr);
return scale;
}
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[RESIZE_CUBIC_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" );
int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" );
vsi_size_t in_width = inputs[0]->attr.size[0];
vsi_size_t in_height = inputs[0]->attr.size[1];
vsi_size_t out_width = outputs[0]->attr.size[0];
vsi_size_t out_height = outputs[0]->attr.size[1];
float half_pixel_value = 0.0f;
float width_scale = 0.0f;
float height_scale = 0.0f;
vsi_nn_tensor_t* scale_w = NULL;
vsi_nn_tensor_t* scale_h = NULL;
vsi_nn_tensor_t* index_w = NULL;
vsi_nn_tensor_t* index_h = NULL;
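/*
 * Scale factors follow the common resize convention: (in - 1) / (out - 1) when align_corners
 * is set and the output dimension is larger than 1, otherwise in / out; half_pixel_centers
 * shifts the sampling grid by 0.5 through half_pixel_value.
 */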
if (align_corners && out_width > 1)
{
width_scale = ((vx_float32)(in_width - 1) * 1.0f) / (vx_float32)(out_width - 1);
}
else
{
width_scale = ((vx_float32)in_width * 1.0f) / (vx_float32)out_width;
}
if (align_corners && out_height > 1)
{
height_scale = ((vx_float32)(in_height - 1) * 1.0f) / (vx_float32)(out_height - 1);
}
else
{
height_scale = ((vx_float32)in_height * 1.0f) / (vx_float32)out_height;
}
if (half_pixel_centers)
{
half_pixel_value = 0.5f;
}
else
{
half_pixel_value = 0.0f;
}
status = _query_kernel( kernel, inputs, outputs );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
size_t node_params_num = RESIZE_CUBIC_NUM;
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, RESIZE_CUBIC_NUM,
inputs, input_num, outputs, output_num );
scale_w = _create_scale_tensor(graph, out_width,\
width_scale, half_pixel_value, &index_w);
CHECK_PTR_FAIL_GOTO( scale_w, "Create buffer fail.", final );
CHECK_PTR_FAIL_GOTO( index_w, "Create buffer fail.", final );
scale_h = _create_scale_tensor(graph, out_height,\
height_scale, half_pixel_value, &index_h);
CHECK_PTR_FAIL_GOTO( scale_h, "Create buffer fail.", final );
CHECK_PTR_FAIL_GOTO( index_h, "Create buffer fail.", final );
node_params[2] = (vsi_nn_kernel_node_param_t)(scale_w->t);
node_params[3] = (vsi_nn_kernel_node_param_t)(scale_h->t);
node_params[4] = (vsi_nn_kernel_node_param_t)(index_w->t);
node_params[5] = (vsi_nn_kernel_node_param_t)(index_h->t);
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
VSI_ASSERT( status == VSI_SUCCESS );
vsi_nn_kernel_scalar_release( &node_params[2] );
vsi_nn_kernel_scalar_release( &node_params[3] );
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
}
}
final:
vsi_safe_release_tensor(scale_w);
vsi_safe_release_tensor(scale_h);
vsi_safe_release_tensor(index_w);
vsi_safe_release_tensor(index_h);
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( resize_cubic, _setup )
View File
@ -208,52 +208,19 @@ DEF_KERNEL_INITIALIZER(_resize_nearest_initializer)
half_pixel_value = 0.0f;
}
if (U8 == input_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant )
{
input_scale = input_attr->asymm.scale;
inputZP = input_attr->asymm.zero_point;
}
else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
input_scale = input_attr->scale;
inputZP = input_attr->zero_point;
output_scale = 1.0f / output_attr->scale;
outputZP = output_attr->zero_point;
if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
srcFixPointPos = input_attr->dfp.fl;
if (srcFixPointPos >= 0)
{
input_scale = 1.0f / (float) ((int64_t)1 << srcFixPointPos);
}
else if (srcFixPointPos < 0)
{
input_scale = (float)((int64_t)1 << -srcFixPointPos);
}
inputZP = 0;
}
else
{
input_scale = 1.0f;
inputZP = 0;
}
if (U8 == output_dtype && VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant )
{
output_scale = 1.0f / output_attr->asymm.scale;
outputZP = output_attr->asymm.zero_point;
}
else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant)
if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant)
{
dstFixPointPos = output_attr->dfp.fl;
if (dstFixPointPos >= 0)
{
output_scale = (float) ((int64_t)1 << dstFixPointPos);
}
else if (dstFixPointPos < 0)
{
output_scale = 1.0f / (float) ((int64_t)1 << -dstFixPointPos);
}
outputZP = 0;
}
else
{
output_scale = 1.0;
outputZP = 0;
}
if (F16 == input_dtype && F16 == output_dtype)
View File
@ -208,10 +208,7 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_initializer)
height = (int32_t)(attr[2]->shape->data[1]);
index_num = (int32_t)(attr[0]->shape->data[1]);
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
output_zp = attr[2]->asymm.zero_point;
}
output_zp = attr[2]->zero_point;
if (coord_dim == 3)
{
@ -367,10 +364,7 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_big_initializer)
height = (int32_t)(attr[2]->shape->data[1]);
index_num = (int32_t)(attr[0]->shape->data[1]);
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
output_zp = attr[2]->asymm.zero_point;
}
output_zp = attr[2]->zero_point;
if (coord_dim == 3)
{
View File
@ -382,6 +382,12 @@ static vsi_status check_scatter_nd_update_index_repeat
int32_t* mask_buffer = NULL;
int32_t mask_len = 0;
if (indices_num == 1)
{
isRepeat[0] = 0;
return VSI_SUCCESS;
}
if (inputs[1]->attr.is_const == FALSE)
{
isRepeat[0] = 1;
@ -451,7 +457,7 @@ static vsi_status check_scatter_nd_update_index_repeat
else if (mask_buffer[mask_idx] > 0)
{
isRepeat[0] = 1;
status = VSI_FAILURE;
status = VSI_SUCCESS;
CHECK_STATUS_FAIL_GOTO( status, final );
}
}
View File
@ -0,0 +1,861 @@
/****************************************************************************
*
* Copyright (c) 2019 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
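/*
 * Reduction modes of scatter_nd_update; the numeric value matches the "reduction" parameter
 * read in _setup() and is hashed into the key that selects the process-stage kernel.
 */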
typedef enum
{
NONE = 0,
Add,
Mul,
Max,
Min
} vsi_scatter_nd_update_type_e;
/*
* Define kernel meta.
*/
#define KERNEL_SOURCE_1 "scatter_nd_update_reduction"
#define KERNEL_SOURCE_2 "scatter_nd_update_reduction_conv"
#define HASH_SCATTER_ND_UPDATE_KEY(_in0_type, _in2_type, _out_type, _stage, _op_type) \
((_in0_type << 24) | (_in2_type << 16) | (_out_type << 8) | (_stage << 4) | (_op_type))
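/*
 * Key layout: bits [31:24] ref dtype, [23:16] update dtype, [15:8] output dtype,
 * [7:4] pipeline stage (0 = preprocess, 1 = reduction process, 2 = convert), [3:0] reduction op.
 */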
#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_PREPROCESS_NAME(SRC0_TYPE) \
CVIVANTE_NAMESPACE("evis.scatter_nd_update_reduction_preprocess_"#SRC0_TYPE)
#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_PROCESS_NAME(REDUCTION_TYPE, SRC2_TYPE) \
CVIVANTE_NAMESPACE("evis.scatter_nd_update_reduction_"#REDUCTION_TYPE"_"#SRC2_TYPE)
#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_CONV_NAME(DST_TYPE) \
CVIVANTE_NAMESPACE("evis.scatter_nd_update_reduction_conv_"#DST_TYPE)
#define TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(IN0_TYPE, SOURCE) \
{ HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, 0, 0, 0, 0), \
HASH_SCATTER_ND_UPDATE_SH_KERNEL_PREPROCESS_NAME(IN0_TYPE), \
SOURCE },
#define TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(REDUCTION_TYPE, IN2_TYPE, SOURCE) \
{ HASH_SCATTER_ND_UPDATE_KEY(0, IN2_TYPE, 0, 1, REDUCTION_TYPE), \
HASH_SCATTER_ND_UPDATE_SH_KERNEL_PROCESS_NAME(REDUCTION_TYPE, IN2_TYPE), \
SOURCE },
#define TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(OUT_TYPE, SOURCE) \
{ HASH_SCATTER_ND_UPDATE_KEY(0, 0, OUT_TYPE, 2, 0), \
HASH_SCATTER_ND_UPDATE_SH_KERNEL_CONV_NAME(OUT_TYPE), \
SOURCE },
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type scatter_nd_update_reduction_preprocess_map[] =
{
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(U8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(I8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(I16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(F16, KERNEL_SOURCE_1)
};
static const _kernel_map_type scatter_nd_update_reduction_process_map[] =
{
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, U8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, U8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, U8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, U8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, I8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, I8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, I8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, I8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, I16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, I16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, I16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, I16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, F16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, F16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, F16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, F16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, BF16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, BF16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, BF16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, BF16, KERNEL_SOURCE_1)
};
static const _kernel_map_type scatter_nd_update_reduction_conv_map[] =
{
TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(U8, KERNEL_SOURCE_2)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(I8, KERNEL_SOURCE_2)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(I16, KERNEL_SOURCE_2)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(F16, KERNEL_SOURCE_2)
TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(BF16, KERNEL_SOURCE_2)
};
/*
* Kernel params
*/
static vx_param_description_t _scatter_nd_update_preprocess_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
static vx_param_description_t _scatter_nd_update_process_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
static vx_param_description_t _scatter_nd_update_conv_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM _cnt_of_array(_scatter_nd_update_preprocess_kernel_param_def)
#define _SCATTER_ND_UPDATE_PROCESS_PARAM_NUM _cnt_of_array(_scatter_nd_update_process_kernel_param_def)
#define _SCATTER_ND_UPDATE_CONV_PARAM_NUM _cnt_of_array(_scatter_nd_update_conv_kernel_param_def)
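/*
 * Flattens a tensor to the 2D shape [block_size, element_count / block_size] expected by the
 * shaders and, when coordDim > 1, fills 'strides' with the partial products of the dimensions
 * addressed by the index so a multi-dimensional coordinate can be linearized in the kernel.
 */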
static vsi_status get_scatter_nd_update_tensor_reshape_size
(
vsi_nn_tensor_t ** inputs,
vsi_size_t sizes[VSI_NN_MAX_DIM_NUM],
uint32_t block_size,
uint32_t coordDim,
vsi_size_t strides[VSI_NN_MAX_DIM_NUM],
int32_t* newDim
)
{
vsi_status status = VSI_SUCCESS;
uint32_t dims_num = inputs[0]->attr.dim_num;
vsi_size_t *input_size = inputs[0]->attr.size;
uint32_t i = 0;
vsi_size_t elementCnt = 1;
#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH
newDim[0] = 0;
for (i = 0; i < dims_num; ++i)
{
elementCnt *= input_size[i];
}
for (i = 0; i < VSI_NN_MAX_DIM_NUM; ++i)
{
sizes[i] = 1;
}
sizes[0] = block_size;
sizes[1] = elementCnt / block_size;
newDim[0] = 2;
if (coordDim == 1 && strides) // index shape
{
for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++)
{
strides[i] = 0;
}
}
else if (coordDim >= 2 && coordDim <= VSI_NN_MAX_DIM_NUM && strides)
{
for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++)
{
strides[i] = 0;
}
strides[0] = input_size[dims_num - coordDim];
for (i = 1; i < coordDim - 1; i++)
{
strides[i] = strides[i - 1] * input_size[dims_num - coordDim + i];
}
}
#undef VSI_NN_MAX_IMAGE_WIDTH
return status;
} /* get_scatter_nd_update_tensor_reshape_size() */
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_scatter_nd_update_preprocess_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
1,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
int32_t width = 0;
int32_t element_size = 1;
int32_t input_zp0 = 0;
float input_scale0 = 1;
int32_t i = 0;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
for (i = 0; i < (int32_t)attr[0]->shape->size; i++)
{
element_size *= (int32_t)attr[0]->shape->data[i];
}
width = element_size / 8;
input_zp0 = attr[0]->zero_point;
input_scale0 = attr[0]->scale;
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE)
{
input_scale0 = 1.0f;
}
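/* Global size is element_size / 8, i.e. one work-item per group of 8 elements; tensors with
 * fewer than 8 elements launch one work-item per element instead. */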
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
if (element_size < 8)
{
gpu_param.global_size[0] = element_size;
}
else
{
gpu_param.global_size[0] = width;
}
gpu_param.global_size[1] = 1;
gpu_param.global_size[2] = 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, OnError);
{
gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{
0x05050505, // TCfg
0x04040404, // ASelt
0x00010000, 0x00030002, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0xffff0001, 0x00000000, 0xffff0001, 0x00000000,
0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvert2ndU8SubZpToFp32_4x4 = {{
0x09090909, // TCfg
0x04040404, // ASelt
0x00050004, 0x00070006, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000300, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
status = vsi_nn_kernel_gpu_add_param( node,
"uniConvert1stUint8SubZpToFp32_4x4", &uniConvert1stUint8SubZpToFp32_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvert2ndU8SubZpToFp32_4x4", &uniConvert2ndU8SubZpToFp32_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node, "input_scale", &input_scale0 );
status |= vsi_nn_kernel_gpu_add_param( node, "input_zp", &input_zp0 );
CHECK_STATUS_FAIL_GOTO(status, OnError);
}
OnError:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
return status;
} /* _scatter_nd_update_preprocess_initializer() */
DEF_KERNEL_INITIALIZER(_scatter_nd_update_process_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
2,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL };
int32_t block_size = 1;
int32_t update_width = 1;
int32_t index_num = 1;
int32_t width = 0;
int32_t coord_dim = 0;
int32_t strides[VSI_NN_MAX_DIM_NUM] = {0};
int32_t coord_strides[8] = {0};
int32_t coord_strides1[4] = {0};
int32_t input_zp2 = 0;
float input_scale2 = 1;
int32_t i = 0;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &strides[0]);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &strides[1]);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &strides[2]);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &strides[3]);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &strides[4]);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &strides[5]);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &strides[6]);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &coord_dim);
CHECK_STATUS_FAIL_GOTO(status, OnError );
block_size = (int32_t)(attr[2]->shape->data[0]);
update_width = (int32_t)(attr[1]->shape->data[0]);
index_num = (int32_t)(attr[0]->shape->data[1]);
width = block_size;
input_zp2 = attr[1]->zero_point;
input_scale2 = attr[1]->scale;
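/* Reverse the index strides so coordinate 0 addresses the outermost dimension; the first four
 * entries are passed as "coord_stride" and the remaining ones as "coord_stride1", presumably
 * read by the shader as two 4-wide vectors. */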
coord_strides[coord_dim - 1] = 1;
for (i = 0; i < coord_dim - 1; i++)
{
coord_strides[i] = strides[coord_dim - 2 - i];
}
memcpy(coord_strides1, coord_strides + 4, 4 * sizeof(int32_t));
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = width;
gpu_param.global_size[1] = index_num;
gpu_param.global_size[2] = 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, OnError);
{
gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{
0x05050505, // TCfg
0x04040404, // ASelt
0x00010000, 0x00030002, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0xffff0001, 0x00000000, 0xffff0001, 0x00000000,
0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "update_width", &update_width );
status |= vsi_nn_kernel_gpu_add_param( node, "output_width", &block_size );
status |= vsi_nn_kernel_gpu_add_param( node, "coord_stride", &coord_strides );
status |= vsi_nn_kernel_gpu_add_param( node, "coord_stride1", &coord_strides1 );
CHECK_STATUS_FAIL_GOTO(status, OnError);
if (attr[1]->dtype == U8 || attr[1]->dtype == I8 || attr[1]->dtype == I16)
{
status = vsi_nn_kernel_gpu_add_param( node,
"uniConvert1stUint8SubZpToFp32_4x4", &uniConvert1stUint8SubZpToFp32_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node, "update_scale", &input_scale2 );
status |= vsi_nn_kernel_gpu_add_param( node, "update_zp", &input_zp2 );
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
else if (attr[1]->dtype == BF16)
{
status = vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 );
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
}
OnError:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
if (attr[2])
{
vsi_nn_kernel_tensor_attr_release( &attr[2] );
attr[2] = NULL;
}
return status;
} /* _scatter_nd_update_process_initializer() */
DEF_KERNEL_INITIALIZER(_scatter_nd_update_conv_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
1,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
int32_t width = 0;
int32_t element_size = 1;
int32_t i = 0;
float output_zp = 0;
float output_scale = 1.0f;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
output_zp = (float)attr[0]->zero_point;
output_scale = (float)1.0 / attr[0]->scale;
for (i = 0; i < (int32_t)attr[0]->shape->size; i++)
{
element_size *= (int32_t)attr[0]->shape->data[i];
}
width = element_size / 8;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
if (element_size < 8)
{
gpu_param.global_size[0] = element_size;
}
else
{
gpu_param.global_size[0] = width;
}
gpu_param.global_size[1] = 1;
gpu_param.global_size[2] = 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, OnError);
{
gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractHalf8_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node,
"uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtractHalf8_2x8", &uniExtractHalf8_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtractOddData_2x8", &uniExtractOddData_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node, "output_zp", &output_zp );
status |= vsi_nn_kernel_gpu_add_param( node, "output_scale", &output_scale );
CHECK_STATUS_FAIL_GOTO(status, OnError);
}
OnError:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
return status;
} /* _scatter_nd_update_conv_initializer() */
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel_preprocess,
vsi_nn_kernel_t* kernel_process,
vsi_nn_kernel_t* kernel_conv,
int32_t reduction_flg
)
{
vsi_status status = VSI_SUCCESS;
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e input2_dtype = F16;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
size_t i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = HASH_SCATTER_ND_UPDATE_KEY(input0_dtype, 0, 0, 0, 0);
for ( i = 0; i < _cnt_of_array(scatter_nd_update_reduction_preprocess_map); i ++ )
{
if ( scatter_nd_update_reduction_preprocess_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(scatter_nd_update_reduction_preprocess_map) )
{
snprintf( kernel_preprocess->info.name, VX_MAX_KERNEL_NAME, "%s",
scatter_nd_update_reduction_preprocess_map[i].function_name );
kernel_preprocess->info.parameters = _scatter_nd_update_preprocess_kernel_param_def;
kernel_preprocess->info.numParams = _SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM;
kernel_preprocess->info.initialize = _scatter_nd_update_preprocess_initializer;
vsi_nn_kernel_add_source( kernel_preprocess, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
scatter_nd_update_reduction_preprocess_map[i].source_name );
vsi_nn_kernel_add_source( kernel_preprocess, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
scatter_nd_update_reduction_preprocess_map[i].source_name );
}
else
{
status = VSI_FAILURE;
}
key = HASH_SCATTER_ND_UPDATE_KEY( 0, input2_dtype, 0, 1, reduction_flg);
for ( i = 0; i < _cnt_of_array(scatter_nd_update_reduction_process_map); i ++ )
{
if ( scatter_nd_update_reduction_process_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(scatter_nd_update_reduction_process_map) )
{
snprintf( kernel_process->info.name, VX_MAX_KERNEL_NAME, "%s",
scatter_nd_update_reduction_process_map[i].function_name );
kernel_process->info.parameters = _scatter_nd_update_process_kernel_param_def;
kernel_process->info.numParams = _SCATTER_ND_UPDATE_PROCESS_PARAM_NUM;
kernel_process->info.initialize = _scatter_nd_update_process_initializer;
vsi_nn_kernel_add_source( kernel_process, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
scatter_nd_update_reduction_process_map[i].source_name );
vsi_nn_kernel_add_source( kernel_process, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
scatter_nd_update_reduction_process_map[i].source_name );
}
else
{
status |= VSI_FAILURE;
}
key = HASH_SCATTER_ND_UPDATE_KEY( 0, 0, output_dtype, 2, 0);
for ( i = 0; i < _cnt_of_array(scatter_nd_update_reduction_conv_map); i ++ )
{
if ( scatter_nd_update_reduction_conv_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(scatter_nd_update_reduction_conv_map) )
{
snprintf( kernel_conv->info.name, VX_MAX_KERNEL_NAME, "%s",
scatter_nd_update_reduction_conv_map[i].function_name );
kernel_conv->info.parameters = _scatter_nd_update_conv_kernel_param_def;
kernel_conv->info.numParams = _SCATTER_ND_UPDATE_CONV_PARAM_NUM;
kernel_conv->info.initialize = _scatter_nd_update_conv_initializer;
vsi_nn_kernel_add_source( kernel_conv, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
scatter_nd_update_reduction_conv_map[i].source_name );
vsi_nn_kernel_add_source( kernel_conv, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
scatter_nd_update_reduction_conv_map[i].source_name );
}
else
{
status |= VSI_FAILURE;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_t node = NULL;
vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}};
vsi_size_t strides[VSI_NN_MAX_DIM_NUM] = {0};
int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" );
int32_t reduction = vsi_nn_kernel_param_get_int32( params, "reduction" );
int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0;
int32_t i = 0;
vsi_nn_tensor_t * tensors[2] = { NULL };
vsi_nn_kernel_t * ikernels[2] = { NULL };
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
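/*
 * Three-node pipeline: the preprocess kernel converts the ref tensor into tensors[0], an
 * internal float copy; the process kernel scatters the updates into that copy using the chosen
 * reduction (tensors[1] is a small dummy buffer that only serializes the process and conv
 * nodes); the conv kernel then converts the float result back to the output dtype.
 */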
status = get_scatter_nd_update_tensor_reshape_size(&inputs[1], shapes[0], coord_dim, 0,
NULL, &rs_idx_dim);
status |= get_scatter_nd_update_tensor_reshape_size(&inputs[2], shapes[1], block_size, 0,
NULL, &rs_in_dim);
status |= get_scatter_nd_update_tensor_reshape_size(&outputs[0], shapes[2], block_size, coord_dim,
strides, &rs_out_dim);
CHECK_STATUS_FAIL_GOTO( status, final );
{
vsi_nn_tensor_attr_t attr;
vsi_nn_kernel_node_t preprocess_node = NULL;
vsi_nn_kernel_node_t process_node = NULL;
vsi_nn_kernel_node_param_t preprocess_params[_SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_param_t process_params[_SCATTER_ND_UPDATE_PROCESS_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_param_t conv_params[_SCATTER_ND_UPDATE_CONV_PARAM_NUM] = { NULL };
int32_t width = 1;
int32_t res = 0;
ikernels[0] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS );
ikernels[0]->unique_id = kernel->unique_id;
ikernels[1] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS );
ikernels[1]->unique_id = kernel->unique_id;
memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) );
attr.dtype = outputs[0]->attr.dtype;
attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
attr.is_const = FALSE;
attr.vtl = TRUE;
for (i = 0; i < rs_out_dim; i++)
{
attr.size[i] = shapes[2][i];
width *= (int32_t)shapes[2][i];
}
attr.dim_num = rs_out_dim;
res = width % 8;
width = (width >> 3) << 3;
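/* width is rounded down to a multiple of 8 and res keeps the remainder; both are passed to the
 * preprocess and conv kernels so they can handle the vectorized part and the tail elements. */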
tensors[0] = vsi_nn_CreateTensor( graph, &attr ); // ref'
attr.size[0] = 1;
attr.size[1] = 1;
attr.dim_num = rs_out_dim;
tensors[1] = vsi_nn_CreateTensor( graph, &attr ); // link_buffer0
status = _query_kernel( inputs, outputs, ikernels[0], ikernels[1], kernel, reduction);
if ( VSI_SUCCESS == status)
{
// convert ref to float
preprocess_node = vsi_nn_kernel_create_node( graph, ikernels[0] );
if (preprocess_node)
{
uint32_t index = 0;
/* Pass parameters to node. */
preprocess_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[2], rs_out_dim );
preprocess_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t;
preprocess_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
preprocess_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &res );
status = vsi_nn_kernel_node_pass_param( preprocess_node, preprocess_params,
_SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_tensor_release( &preprocess_params[0] );
vsi_nn_kernel_scalar_release( &preprocess_params[2] );
vsi_nn_kernel_scalar_release( &preprocess_params[3] );
}
// update
process_node = vsi_nn_kernel_create_node( graph, ikernels[1] );
if (process_node)
{
uint32_t index = 0;
/* Pass parameters to node. */
process_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[0], rs_idx_dim );
process_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shapes[1], rs_in_dim );
process_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t;
process_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t;
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[0] );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[1] );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[2] );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[3] );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[4] );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[5] );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[6] );
process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim );
status = vsi_nn_kernel_node_pass_param( process_node, process_params,
_SCATTER_ND_UPDATE_PROCESS_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_tensor_release( &process_params[0] );
vsi_nn_kernel_tensor_release( &process_params[1] );
vsi_nn_kernel_scalar_release( &process_params[4] );
vsi_nn_kernel_scalar_release( &process_params[5] );
vsi_nn_kernel_scalar_release( &process_params[6] );
vsi_nn_kernel_scalar_release( &process_params[7] );
vsi_nn_kernel_scalar_release( &process_params[8] );
vsi_nn_kernel_scalar_release( &process_params[9] );
vsi_nn_kernel_scalar_release( &process_params[10] );
vsi_nn_kernel_scalar_release( &process_params[11] );
}
// convert float to output
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
uint32_t index = 0;
/* Pass parameters to node. */
conv_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t;
conv_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t;
conv_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], rs_out_dim );
conv_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
conv_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &res );
status = vsi_nn_kernel_node_pass_param( node, conv_params, _SCATTER_ND_UPDATE_CONV_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_tensor_release( &conv_params[2] );
vsi_nn_kernel_scalar_release( &conv_params[3] );
vsi_nn_kernel_scalar_release( &conv_params[4] );
}
}
if (preprocess_node) {vsi_nn_kernel_node_release( &preprocess_node );}
if (process_node) {vsi_nn_kernel_node_release( &process_node );}
}
final:
if (ikernels[0])
{
vsi_nn_kernel_release(&ikernels[0]);
}
if (ikernels[1])
{
vsi_nn_kernel_release(&ikernels[1]);
}
vsi_safe_release_tensor(tensors[0]);
vsi_safe_release_tensor(tensors[1]);
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( scatter_nd_update_reduction, _setup )
View File
@ -22,6 +22,7 @@
*
*****************************************************************************/
#if !(VX_TENSOR_SELECT_VX_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -159,7 +160,6 @@ DEF_KERNEL_INITIALIZER(_select_initializer)
vsi_nn_kernel_tensor_attr_t *input1_attr = NULL;
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t *output_shape = NULL;
int32_t input0_fl = 0, input1_fl = 0, output_fl = 0;
float input0Scale = 1.0f;
int32_t input0Zp = 0;
float input1Scale = 1.0f;
@ -180,59 +180,12 @@ DEF_KERNEL_INITIALIZER(_select_initializer)
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output);
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
if ( input0_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
input0_fl = input0_attr->dfp.fl;
if (input0_fl > 0)
{
input0Scale = 1.0f / (float) ((int64_t)1 << input0_fl);
}
else
{
input0Scale = (float)((int64_t)1 << -input0_fl);
}
}
else if ( input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
input0Scale = input0_attr->asymm.scale;
input0Zp = input0_attr->asymm.zero_point;
}
if ( input1_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
input1_fl = input1_attr->dfp.fl;
if (input1_fl > 0)
{
input1Scale = 1.0f / (float) ((int64_t)1 << input1_fl);
}
else
{
input1Scale = (float)((int64_t)1 << -input1_fl);
}
}
else if ( input1_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
input1Scale = input1_attr->asymm.scale;
input1Zp = input1_attr->asymm.zero_point;
}
if ( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
output_fl = output_attr->dfp.fl;
if (output_fl > 0)
{
outputScale = 1.0f / (float) ((int64_t)1 << output_fl);
}
else
{
outputScale = (float)((int64_t)1 << -output_fl);
}
}
else if ( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
outputScale = output_attr->asymm.scale;
outputZP = output_attr->asymm.zero_point;
}
input0Scale = input0_attr->scale;
input0Zp = input0_attr->zero_point;
input1Scale = input1_attr->scale;
input1Zp = input1_attr->zero_point;
outputScale = output_attr->scale;
outputZP = output_attr->zero_point;
gpu_quantize_multiplier_16bit(input0Scale / outputScale, &in0_M0, &in0_postShift);
gpu_quantize_multiplier_16bit(input1Scale / outputScale, &in1_M0, &in1_postShift);
@ -541,3 +494,4 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_EVIS( select, _setup )
#endif
View File
@ -131,42 +131,10 @@ DEF_KERNEL_INITIALIZER(_sequence_mask_initializer)
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
out_shape = attr[1]->shape;
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
input_zp = attr[0]->asymm.zero_point;
scaleIn = attr[0]->asymm.scale;
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[0]->dfp.fl > 0)
{
scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
}
else
{
scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
input_zp = 0;
}
if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
output_zp = attr[1]->asymm.zero_point;
scaleOut = 1.0f / attr[1]->asymm.scale;
}
else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[1]->dfp.fl > 0)
{
scaleOut = (float)((int64_t)1 << attr[1]->dfp.fl);
}
else
{
scaleOut = (1.0f / (float)((int64_t)1 << -attr[1]->dfp.fl));
}
output_zp = 0;
}
input_zp = attr[0]->zero_point;
scaleIn = attr[0]->scale;
output_zp = attr[1]->zero_point;
scaleOut = 1.0f / attr[1]->scale;
outputVal1 = scaleOut + (float)output_zp;
View File
@ -157,8 +157,6 @@ DEF_KERNEL_INITIALIZER(_slice_initializer)
float scaleOut = 1.0f;
int32_t output_ZP = 0;
int32_t input_ZP = 0;
int32_t srcFixPointPos = 0;
int32_t dstFixPointPos = 0;
int32_t is_samefl = 0;
uint32_t pack_key = 0;
@ -178,41 +176,10 @@ DEF_KERNEL_INITIALIZER(_slice_initializer)
pack_key = _PACK_SLICE_KEY( input_dtype, output_dtype, is_samefl);
if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
srcFixPointPos = input_attr->dfp.fl;
if (srcFixPointPos > 0)
{
scaleIn = (1.0f / ((float) ((int64_t)1 << srcFixPointPos)));
}
else
{
scaleIn = ((float) ((int64_t)1 << -srcFixPointPos));
}
}
else if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant)
{
input_ZP = input_attr->asymm.zero_point;
scaleIn = input_attr->asymm.scale;
}
if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant)
{
dstFixPointPos = output_attr->dfp.fl;
if (dstFixPointPos > 0)
{
scaleOut = (1.0f / ((float) ((int64_t)1 << dstFixPointPos)));
}
else
{
scaleOut = ((float) ((int64_t)1 << -dstFixPointPos));
}
}
else if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant)
{
output_ZP = output_attr->asymm.zero_point;
scaleOut = output_attr->asymm.scale;
}
input_ZP = input_attr->zero_point;
scaleIn = input_attr->scale;
output_ZP = output_attr->zero_point;
scaleOut = output_attr->scale;
if ((I8 == input_dtype && input_dtype == output_dtype ) ||
(U8 == input_dtype && input_dtype == output_dtype ) )
View File
@ -170,23 +170,8 @@ DEF_KERNEL_INITIALIZER(_get_matrix_initializer)
attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final );
if ( attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr->dfp.fl;
if (fl > 0)
{
input_scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
input_scale = (float)((int64_t)1 << -fl);
}
}
else if ( attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
input_scale = attr->asymm.scale;
input_tail = 0 - attr->asymm.zero_point * input_scale;
}
input_scale = attr->scale;
input_tail = 0 - attr->zero_point * input_scale;
in_shape = attr->shape;
@ -265,42 +250,10 @@ DEF_KERNEL_INITIALIZER(_warp_affine_initializer)
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[0]->dfp.fl;
if (fl > 0)
{
input_scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
input_scale = (float)((int64_t)1 << -fl);
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
input_scale = attr[0]->asymm.scale;
input_tail = 0 - attr[0]->asymm.zero_point * input_scale;
}
if (attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
int32_t fl = attr[1]->dfp.fl;
if (fl >= 0)
{
output_scale = (vx_float32) ((vx_int64)1 << fl);
}
else if (fl < 0)
{
output_scale = 1.0f / (vx_float32) ((vx_int64)1 << -fl);
}
}
else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
output_scale = 1.0f / attr[1]->asymm.scale;
output_zp = (float)attr[1]->asymm.zero_point;
}
input_scale = attr[0]->scale;
input_tail = 0 - attr[0]->zero_point * input_scale;
output_scale = 1.0f / attr[1]->scale;
output_zp = (float)attr[1]->zero_point;
out_shape = attr[1]->shape;
View File
@ -166,8 +166,6 @@ DEF_KERNEL_INITIALIZER(_swish_initializer)
vx_tensor input = (vx_tensor)param[0];
vx_tensor output = (vx_tensor)param[1];
int8_t srcFixPointPos = 0;
int8_t dstFixPointPos = 0;
vx_float32 inputTail = 0;
vx_float32 inputScale = 1.0f;
vx_float32 outputZP = 0;
@ -186,42 +184,11 @@ DEF_KERNEL_INITIALIZER(_swish_initializer)
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
out_shape = output_attr->shape;
inputScale = input_attr->scale;
inputTail = 0 - (vx_float32)input_attr->zero_point * inputScale;
outputScale = 1.0f / output_attr->scale;
outputZP = (vx_float32)(output_attr->zero_point);
if (input_attr->quant == VSI_NN_KERNEL_QUANT_DFP)
{
srcFixPointPos = (int8_t)input_attr->dfp.fl;
if (srcFixPointPos > 0)
{
inputScale = 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos);
}
else
{
inputScale = (vx_float32)((int64_t)1 << -srcFixPointPos);
}
}
else if (input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM || input_attr->quant == VSI_NN_KERNEL_QUANT_SYMM)
{
inputScale = input_attr->asymm.scale;
inputTail = 0 - input_attr->asymm.zero_point * inputScale;
}
if (output_attr->quant == VSI_NN_KERNEL_QUANT_DFP)
{
dstFixPointPos = (int8_t)output_attr->dfp.fl;
if (dstFixPointPos > 0)
{
outputScale = (vx_float32) ((int64_t)1 << dstFixPointPos);
}
else
{
outputScale = 1.0f / (vx_float32) ((int64_t)1 << -dstFixPointPos);
}
}
else if (output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM || output_attr->quant == VSI_NN_KERNEL_QUANT_SYMM)
{
outputScale = 1.0f / output_attr->asymm.scale;
outputZP = (vx_float32)(output_attr->asymm.zero_point);
}
#define _PACK_SELECT_KEY( IN_TYPE, OUT_TYPE ) \
(IN_TYPE | ( OUT_TYPE << 16))
@ -379,8 +346,6 @@ DEF_KERNEL_INITIALIZER(_hswish_initializer)
vx_tensor input = (vx_tensor)param[0];
vx_tensor output = (vx_tensor)param[1];
int8_t srcFixPointPos = 0;
int8_t dstFixPointPos = 0;
vx_float32 inputTail = 0;
vx_float32 inputScale = 1.0f;
vx_float32 outputZP = 0;
@ -398,42 +363,11 @@ DEF_KERNEL_INITIALIZER(_hswish_initializer)
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
out_shape = output_attr->shape;
inputScale = input_attr->scale;
inputTail = 0 - (vx_float32)input_attr->zero_point * inputScale;
outputScale = 1.0f / output_attr->scale;
outputZP = (vx_float32)(output_attr->zero_point);
if (input_attr->quant == VSI_NN_KERNEL_QUANT_DFP)
{
srcFixPointPos = (int8_t)input_attr->dfp.fl;
if (srcFixPointPos > 0)
{
inputScale = 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos);
}
else
{
inputScale = (vx_float32)((int64_t)1 << -srcFixPointPos);
}
}
else if (input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM || input_attr->quant == VSI_NN_KERNEL_QUANT_SYMM)
{
inputScale = input_attr->asymm.scale;
inputTail = 0 - input_attr->asymm.zero_point * inputScale;
}
if (output_attr->quant == VSI_NN_KERNEL_QUANT_DFP)
{
dstFixPointPos = (int8_t)output_attr->dfp.fl;
if (dstFixPointPos > 0)
{
outputScale = (vx_float32) ((int64_t)1 << dstFixPointPos);
}
else
{
outputScale = 1.0f / (vx_float32) ((int64_t)1 << -dstFixPointPos);
}
}
else if (output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM || output_attr->quant == VSI_NN_KERNEL_QUANT_SYMM)
{
outputScale = 1.0f / output_attr->asymm.scale;
outputZP = (vx_float32)(output_attr->asymm.zero_point);
}
#define _PACK_SELECT_KEY( IN_TYPE, OUT_TYPE ) \
(IN_TYPE | ( OUT_TYPE << 16))
Some files were not shown because too many files have changed in this diff