Update internal to 1.1.32.1

SHA: 215204
Signed-off-by: Kainan Cha <kainan.zha@verisilicon.com>

parent 8fb3a7e6fb  commit 4d4bc08d6a
@@ -195,14 +195,6 @@ cc_library(
"src/kernel/vsi_nn_kernel_param.c",
"src/kernel/vsi_nn_gpu.c",
"src/kernel/vsi_nn_kernel_gpu_shape_optimize.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_imageprocess.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_signalframe.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_tensorstackconcat.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_sync_host.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_spatial_transformer.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_extra_ending.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_heatmap_max_keypoint.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_box_with_nms_limit.c",
"src/libnnext/vsi_nn_libnnext_resource.c",
"src/libnnext/vsi_nn_vxkernel.c",
] + [":kernel_srcs"]
@@ -156,3 +156,5 @@ DEF_OP(ERF)
DEF_OP(ONE_HOT)
DEF_OP(NMS)
DEF_OP(GROUPED_CONV1D)
DEF_OP(SCATTER_ND_UPDATE)
DEF_OP(GELU)
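For context (not part of the diff): in ovxlib this ops list is consumed by redefining DEF_OP before including it, so each new entry expands into an operation id. A minimal sketch of that convention, assuming the stock macro layout and that the list lives in the usual ops .def file:

/* sketch only: typical pattern for consuming an ops .def list */
#define DEF_OP( NAME )  VSI_NN_OP_##NAME,
typedef enum {
    #include "interface/ops.def"   /* DEF_OP(GELU) would contribute VSI_NN_OP_GELU here */
} vsi_nn_op_t;
#undef DEF_OP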
@@ -0,0 +1,37 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#ifndef _VSI_NN_OP_GELU_H
#define _VSI_NN_OP_GELU_H

#include "vsi_nn_types.h"

typedef struct _vsi_nn_gelu_param
{
vsi_bool approximate;
} vsi_nn_gelu_param;


#endif
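A minimal usage sketch, not part of this commit: assuming DEF_OP(GELU) yields the VSI_NN_OP_GELU operation id and the node comes from the standard ovxlib graph API, the new field would be set roughly like this (graph creation and tensor wiring omitted):

/* hypothetical call site */
vsi_nn_node_t * node = vsi_nn_AddNode( graph, VSI_NN_OP_GELU, 1, 1, NULL );
node->nn_param.gelu.approximate = TRUE;  /* request the tanh-approximation path */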
@@ -0,0 +1,43 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_SCATTER_ND_UPDATE_H
#define _VSI_NN_OP_SCATTER_ND_UPDATE_H

#include "vsi_nn_types.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef struct _vsi_nn_scatter_nd_update_param
{
vsi_bool use_locking;
} vsi_nn_scatter_nd_update_param;

#ifdef __cplusplus
}
#endif

#endif
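Purely as an illustrative sketch (the field mirrors TensorFlow's scatter_nd_update attribute; the call site below is hypothetical, not defined by this diff):

node->nn_param.scatter_nd_update.use_locking = FALSE;  /* hypothetical node set-up */
/* reference semantics: output starts as a copy of input, then output[indices[i]] = updates[i] */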
@@ -53,7 +53,11 @@ typedef struct _vsi_nn_signalframe_param
uint32_t window_length;
uint32_t step;
uint32_t pad_end;
uint32_t pad;
union
{
uint32_t pad;
float pad_value;
};
uint32_t axis;
} vsi_nn_signalframe_param;
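One observation on the change above: the standalone uint32_t pad field becomes an anonymous union, so pad and pad_value share storage and existing code that writes the integer flag keeps compiling. A small sketch, assuming the toolchain accepts anonymous unions (C11 or a common compiler extension):

vsi_nn_signalframe_param p;
p.pad = 1;          /* legacy integer field, same bytes as ...      */
p.pad_value = 0.0f; /* ...the new float padding value for pad_end   */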
@@ -55,8 +55,7 @@ typedef struct _vsi_nn_spatial_transformer_param
float theta_2_1;
float theta_2_2;
float theta_2_3;

vsi_nn_spatial_transformer_lcl_data lcl;
vsi_bool align_corners;
} vsi_nn_spatial_transformer_param;

#ifdef __cplusplus

@@ -64,4 +63,3 @@ typedef struct _vsi_nn_spatial_transformer_param
#endif

#endif
@@ -63,8 +63,16 @@ typedef struct _vsi_nn_hw_config_t
#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
uint32_t subGroupSize;
#endif
uint32_t use_40bits_va;
} vsi_nn_hw_config_t;

typedef struct _vsi_nn_runtime_option_t
{
int32_t enable_shader;
int32_t enable_opcheck;
int32_t enable_concat_optimize;
} vsi_nn_runtime_option_t;

/**
 * Ovxlib NN runtime context.
 */

@@ -72,6 +80,7 @@ typedef struct _vsi_nn_context_t
{
vx_context c;
vsi_nn_hw_config_t config;
vsi_nn_runtime_option_t options;
} *vsi_nn_context_t;

/**
@@ -87,7 +87,8 @@ vsi_nn_internal_tensor_t* vsi_nn_internal_create_zero_bias_tensor
(
vsi_nn_node_t* node,
vsi_nn_tensor_attr_t* input_attr,
vsi_nn_tensor_attr_t* weight_attr
vsi_nn_tensor_attr_t* weight_attr,
vsi_bool use_virtual_tensor
);
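Every caller now has to pass the extra flag; a hypothetical call site (variable names invented for illustration) would look like:

vsi_nn_internal_tensor_t * zero_bias = vsi_nn_internal_create_zero_bias_tensor(
    self, &inputs[0]->attr, &weights->attr, TRUE /* use_virtual_tensor */ );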

vsi_status vsi_nn_internal_deinit_node
@@ -170,6 +170,8 @@
#include "ops/vsi_nn_op_one_hot.h"
#include "ops/vsi_nn_op_nms.h"
#include "ops/vsi_nn_op_grouped_conv1d.h"
#include "ops/vsi_nn_op_scatter_nd_update.h"
#include "ops/vsi_nn_op_gelu.h"
/* custom node head define define */
#include "custom/vsi_nn_custom_node_type.h"

@@ -326,6 +328,8 @@ typedef union _vsi_nn_nn_param
vsi_nn_one_hot_param one_hot;
vsi_nn_nms_param nms;
vsi_nn_grouped_conv1d_param grouped_conv1d;
vsi_nn_scatter_nd_update_param scatter_nd_update;
vsi_nn_gelu_param gelu;
uint8_t client_param[128];

/* custom node data struct define */
@@ -721,6 +721,13 @@ vsi_status vsi_nn_SwapHandle
void ** old_ptr
);

vsi_bool vsi_nn_ConvertTensor
(
vsi_nn_graph_t* graph,
vsi_nn_tensor_t* input,
vsi_nn_tensor_t* output
);

#ifdef __cplusplus
}
#endif
@@ -33,7 +33,7 @@ extern "C"{

#define VSI_NN_VERSION_MAJOR 1
#define VSI_NN_VERSION_MINOR 1
#define VSI_NN_VERSION_PATCH 32
#define VSI_NN_VERSION_PATCH 33
#define VSI_NN_VERSION \
(VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
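As a quick sanity check on the bump (plain arithmetic from the defines above): the packed value goes from 1 * 10000 + 1 * 100 + 32 = 10132 to 1 * 10000 + 1 * 100 + 33 = 10133, so downstream code could gate on the new ops roughly like this:

#if VSI_NN_VERSION >= 10133   /* first ovxlib version carrying SCATTER_ND_UPDATE / GELU */
/* ... */
#endif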
@@ -35,7 +35,6 @@
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
//#include "libnnext/vx_lib_nnext.h"

#define _CPU_ARG_NUM (1)
#define _CPU_INPUT_NUM (1)

@@ -35,7 +35,6 @@
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
//#include "libnnext/vx_lib_nnext.h"

#define _CPU_ARG_NUM (1)
#define _CPU_INPUT_NUM (1)

@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"

__BEGIN_DECLS

@@ -284,4 +283,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_CL( add_mean_std_norm, _setup )
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"

__BEGIN_DECLS

@@ -280,4 +279,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_CL( cast, _setup )
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"

__BEGIN_DECLS

@@ -223,7 +222,6 @@ static vsi_status _query_kernel
}

return status;

} /* _query_kernel() */


@@ -303,4 +301,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_CL( clip, _setup )
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"

__BEGIN_DECLS

@@ -191,7 +190,6 @@ static vsi_status _query_kernel
{
*is_use_u8_kernel = FALSE;
param_def_size = _DETECT_POST_BOX_F32_PARAM_NUM;

}

key = DETECT_POST_BOX_HASH_KEY( in0_dtype, in1_dtype, out_dtype );

@@ -311,4 +309,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_CL( detect_post_box, _setup )
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"

__BEGIN_DECLS
#if 0

@@ -188,4 +187,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_CL( detect_post_nms, _setup )
@@ -49,6 +49,8 @@ typedef enum
UNARY_HSIGMOID,
UNARY_MISH,
UNARY_ROUND,
UNARY_GELU,
UNARY_HGELU
} unary_type_e;

/*

@@ -94,6 +96,8 @@ typedef enum
#define HSIGMOID_OPERATION hard_sigmoid
#define MISH_OPERATION mish
#define ROUND_OPERATION round
#define GELU_OPERATION gelu
#define HGELU_OPERATION hard_gelu

static const struct {
uint32_t key;

@@ -117,6 +121,10 @@ static const struct {
TENSOR_UNARY_KERNELS_FLOAT(MISH_OPERATION, UNARY_MISH, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT(ROUND_OPERATION, UNARY_ROUND, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT(ROUND_OPERATION, UNARY_ROUND, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT(GELU_OPERATION, UNARY_GELU, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT(GELU_OPERATION, UNARY_GELU, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT(HGELU_OPERATION, UNARY_HGELU, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT(HGELU_OPERATION, UNARY_HGELU, F16, F16)

TENSOR_UNARY_KERNELS_FLOAT_2D(SIN_OPERATION, UNARY_SIN, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT_2D(SIN_OPERATION, UNARY_SIN, F16, F16)

@@ -134,6 +142,10 @@ static const struct {
TENSOR_UNARY_KERNELS_FLOAT_2D(MISH_OPERATION, UNARY_MISH, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT_2D(ROUND_OPERATION, UNARY_ROUND, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT_2D(ROUND_OPERATION, UNARY_ROUND, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT_2D(GELU_OPERATION, UNARY_GELU, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT_2D(GELU_OPERATION, UNARY_GELU, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT_2D(HGELU_OPERATION, UNARY_HGELU, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT_2D(HGELU_OPERATION, UNARY_HGELU, F16, F16)

TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, U8, U8)
TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, U8, U8)

@@ -143,6 +155,8 @@ static const struct {
TENSOR_UNARY_KERNELS(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8)
TENSOR_UNARY_KERNELS(MISH_OPERATION, UNARY_MISH, U8, U8)
TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, U8, U8)
TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, U8, U8)
TENSOR_UNARY_KERNELS(HGELU_OPERATION, UNARY_HGELU, U8, U8)

TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, U8)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, U8)

@@ -152,6 +166,8 @@ static const struct {
TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, U8, U8)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, U8)
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, U8, U8)
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, U8, U8)

TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, I32, I32)


@@ -166,6 +182,8 @@ static const struct {
#undef HSIGMOID_OPERATION
#undef MISH_OPERATION
#undef ROUND_OPERATION
#undef GELU_OPERATION
#undef HGELU_OPERATION
/*
* Kernel params
*/

@@ -417,4 +435,6 @@ REGISTER_ELTWISE_UNARY_BACKEND_CL( neg, UNARY_NEG )
REGISTER_ELTWISE_UNARY_BACKEND_CL( hard_sigmoid, UNARY_HSIGMOID )
REGISTER_ELTWISE_UNARY_BACKEND_CL( mish, UNARY_MISH )
REGISTER_ELTWISE_UNARY_BACKEND_CL( round, UNARY_ROUND )
REGISTER_ELTWISE_UNARY_BACKEND_CL( gelu, UNARY_GELU )
REGISTER_ELTWISE_UNARY_BACKEND_CL( hard_gelu, UNARY_HGELU )
__END_DECLS
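For readers unfamiliar with the activations being registered above, reference formulas in plain C (standard definitions, not taken from this diff; whether "hard_gelu" maps to the tanh approximation is an assumption):

#include <math.h>
/* exact GELU: 0.5 * x * (1 + erf(x / sqrt(2))) */
static float gelu_ref( float x )
{
    return 0.5f * x * ( 1.0f + erff( x * 0.70710678f ) );
}
/* common tanh approximation of GELU */
static float gelu_tanh_ref( float x )
{
    return 0.5f * x * ( 1.0f + tanhf( 0.79788456f * ( x + 0.044715f * x * x * x ) ) );
}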
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"

__BEGIN_DECLS

@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"

__BEGIN_DECLS

@@ -210,4 +209,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_CL( grucell_activation, _setup )
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"

__BEGIN_DECLS

@@ -210,4 +209,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_CL( grucell_activation_sma, _setup )
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"

__BEGIN_DECLS

@@ -331,4 +330,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_CL( l2normalizescale, _setup )
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"

__BEGIN_DECLS

@@ -240,4 +239,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_CL( logical_not, _setup )
@@ -307,7 +307,8 @@ static vsi_status _query_kernel
kernel->info.parameters = param_def;
kernel->info.numParams = (uint32_t)param_size;
kernel->info.initialize = initializer;
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
kernel_map[i].source_name );
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
@ -0,0 +1,376 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "kernel/vsi_nn_kernel_eltwise.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
/*
|
||||
* Define kernel meta.
|
||||
*/
|
||||
#define KERNEL_SOURCE_1 "scatter_nd_update"
|
||||
|
||||
#define HASH_SCATTER_ND_UPDATE_KEY(_input0_type, _input2_type, _output_type, _coord_dim) \
|
||||
((_input0_type << 24) | (_input2_type << 16) | (_output_type << 8) | (_coord_dim))
|
||||
|
||||
#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_NAME(SRC0_TYPE, SRC2_TYPE, DST_TYPE) \
|
||||
CVIVANTE_NAMESPACE("cl.scatter_nd_update_"#SRC0_TYPE#SRC2_TYPE"to"#DST_TYPE)
|
||||
|
||||
#define TENSOR_SCATTER_ND_UPDATE_KERNELS(IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE, SOURCE) \
|
||||
{ HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, IN2_TYPE, OUT_TYPE, 0), \
|
||||
HASH_SCATTER_ND_UPDATE_SH_KERNEL_NAME(IN0_TYPE, IN2_TYPE, OUT_TYPE), \
|
||||
SOURCE },
|
||||
|
||||
static const struct {
|
||||
uint32_t key;
|
||||
char* function_name;
|
||||
const char* source_name;
|
||||
} scatter_nd_update_map[] =
|
||||
{
|
||||
TENSOR_SCATTER_ND_UPDATE_KERNELS(I32, I32, I32, I32, KERNEL_SOURCE_1)
|
||||
TENSOR_SCATTER_ND_UPDATE_KERNELS(U32, I32, U32, U32, KERNEL_SOURCE_1)
|
||||
TENSOR_SCATTER_ND_UPDATE_KERNELS(F32, I32, F32, F32, KERNEL_SOURCE_1)
|
||||
};
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _scatter_nd_update_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
};
|
||||
|
||||
#define _SCATTER_ND_UPDATE_PARAM_NUM _cnt_of_array(_scatter_nd_update_kernel_param_def)
|
||||
|
||||
static vsi_status cal_scatter_nd_update_tensor_reshape_size
|
||||
(
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
int32_t sizes[VSI_NN_MAX_DIM_NUM],
|
||||
uint32_t block_size,
|
||||
uint32_t coordDim,
|
||||
uint32_t* width,
|
||||
uint32_t* area,
|
||||
uint32_t* vol,
|
||||
int32_t* newDim
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
uint32_t dims_num = inputs[0]->attr.dim_num;
|
||||
uint32_t *input_size = inputs[0]->attr.size;
|
||||
uint32_t i = 0;
|
||||
uint32_t elementCnt = 1;
|
||||
|
||||
if (coordDim != 0 && (width == NULL || area == NULL))
|
||||
{
|
||||
return status;
|
||||
}
|
||||
|
||||
#define VSI_NN_MAX_IMAGE_WIDTH (65536)
|
||||
|
||||
newDim[0] = 0;
|
||||
for(i = 0; i < dims_num; ++i)
|
||||
{
|
||||
elementCnt *= input_size[i];
|
||||
}
|
||||
|
||||
for(i = 0; i < VSI_NN_MAX_DIM_NUM; ++i)
|
||||
{
|
||||
sizes[i] = 1;
|
||||
}
|
||||
|
||||
if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH)
|
||||
{
|
||||
sizes[0] = block_size;
|
||||
sizes[1] = elementCnt / block_size;
|
||||
status = VSI_SUCCESS;
|
||||
newDim[0] = 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
return status;
|
||||
}
|
||||
|
||||
if (coordDim == 1) // index shape
|
||||
{
|
||||
*width = 0;
|
||||
*area = 0;
|
||||
}
|
||||
else if (coordDim == 2)
|
||||
{
|
||||
*width = input_size[dims_num - 2];
|
||||
*area = 0;
|
||||
}
|
||||
else if (coordDim == 3)
|
||||
{
|
||||
*width = input_size[dims_num - 3];
|
||||
*area = input_size[dims_num - 3] * input_size[dims_num - 2];
|
||||
}
|
||||
else if (coordDim == 4)
|
||||
{
|
||||
*width = input_size[dims_num - 4];
|
||||
*area = input_size[dims_num - 4] * input_size[dims_num - 3];
|
||||
*vol = input_size[dims_num - 4] * input_size[dims_num - 3] * input_size[dims_num - 2];
|
||||
}
|
||||
else if (coordDim == 5)
|
||||
{
|
||||
*width = input_size[dims_num - 5];
|
||||
*area = input_size[dims_num - 5] * input_size[dims_num - 4];
|
||||
*vol = input_size[dims_num - 5] * input_size[dims_num - 4] * input_size[dims_num - 3];
|
||||
}
|
||||
#undef VSI_NN_MAX_IMAGE_WIDTH
|
||||
|
||||
return status;
|
||||
} /* _get_EltOP_tensor_reshape_size */
|
||||
|
||||
/*
|
||||
* Kernel initializer
|
||||
*/
|
||||
DEF_KERNEL_INITIALIZER(_scatter_nd_update_initializer)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
gpu_param_t gpu_param = {
|
||||
3,
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0}
|
||||
};
|
||||
|
||||
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
|
||||
int32_t block_size = 0;
|
||||
int32_t height = 0;
|
||||
|
||||
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
|
||||
|
||||
block_size = attr[0]->shape->data[0];
|
||||
height = attr[0]->shape->data[1];
|
||||
|
||||
gpu_param.global_scale[0] = 1;
|
||||
gpu_param.global_scale[1] = 1;
|
||||
gpu_param.global_scale[2] = 1;
|
||||
|
||||
gpu_param.global_size[0] = block_size;
|
||||
gpu_param.global_size[1] = height;
|
||||
gpu_param.global_size[2] = 1;
|
||||
|
||||
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
|
||||
CHECK_STATUS_FAIL_GOTO(status, final);
|
||||
|
||||
final:
|
||||
if (attr[0])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &attr[0] );
|
||||
attr[0] = NULL;
|
||||
}
|
||||
return status;
|
||||
} /* _scatter_nd_update_initializer() */
|
||||
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs,
|
||||
int32_t coord_dim
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_dtype_e input0_dtype = U8;
|
||||
vsi_nn_kernel_dtype_e input2_dtype = U8;
|
||||
vsi_nn_kernel_dtype_e output_dtype = U8;
|
||||
uint32_t key = 0;
|
||||
int i = 0;
|
||||
|
||||
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
|
||||
input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
|
||||
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
|
||||
|
||||
key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 0 );
|
||||
|
||||
for( i = 0; i < _cnt_of_array(scatter_nd_update_map); i ++ )
|
||||
{
|
||||
if ( scatter_nd_update_map[i].key == key )
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if ( i < _cnt_of_array(scatter_nd_update_map) )
|
||||
{
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_update_map[i].function_name );
|
||||
kernel->info.parameters = _scatter_nd_update_kernel_param_def;
|
||||
kernel->info.numParams = _cnt_of_array( _scatter_nd_update_kernel_param_def );
|
||||
kernel->info.initialize = _scatter_nd_update_initializer;
|
||||
// Register code source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
|
||||
"eltwise_ops_helper",
|
||||
scatter_nd_update_map[i].source_name );
|
||||
// Register binary source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
|
||||
scatter_nd_update_map[i].source_name );
|
||||
status = VSI_SUCCESS;
|
||||
}
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_SCATTER_ND_UPDATE_PARAM_NUM] = {NULL};
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}};
|
||||
int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
|
||||
int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" );
|
||||
int32_t idx_num = vsi_nn_kernel_param_get_int32( params, "idx_num" );
|
||||
int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0;
|
||||
uint32_t width = 0, area = 0, vol = 0;
|
||||
int32_t offsetX = 0, offsetY = 0, offsetZ = 0, offsetW = 0, offset_idx = 0;
|
||||
|
||||
status = cal_scatter_nd_update_tensor_reshape_size(&inputs[1], shapes[0],
|
||||
coord_dim, 0, NULL, NULL, NULL, &rs_in_dim);
|
||||
status |= cal_scatter_nd_update_tensor_reshape_size(&inputs[2], shapes[1],
|
||||
block_size, 0, NULL, NULL, NULL, &rs_idx_dim);
|
||||
status |= cal_scatter_nd_update_tensor_reshape_size(&outputs[0], shapes[2],
|
||||
block_size, coord_dim, &width, &area, &vol, &rs_out_dim);
|
||||
if (status != VSI_SUCCESS)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
|
||||
outputs[0]->attr.dim_num ) )
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (coord_dim == 5)
|
||||
{
|
||||
offset_idx = 1;
|
||||
}
|
||||
if (coord_dim == 4 || coord_dim == 5)
|
||||
{
|
||||
offsetX = vol;
|
||||
offsetY = area;
|
||||
offsetZ = width;
|
||||
offsetW = 1;
|
||||
}
|
||||
else if (coord_dim == 3)
|
||||
{
|
||||
offsetX = area;
|
||||
offsetY = width;
|
||||
offsetZ = 1;
|
||||
offsetW = 0;
|
||||
}
|
||||
else if (coord_dim == 2)
|
||||
{
|
||||
offsetX = width;
|
||||
offsetY = 1;
|
||||
offsetZ = 0;
|
||||
offsetW = 0;
|
||||
}
|
||||
else if (coord_dim == 1)
|
||||
{
|
||||
offsetX = 1;
|
||||
offsetY = 0;
|
||||
offsetZ = 0;
|
||||
offsetW = 0;
|
||||
}
|
||||
|
||||
status = _query_kernel( kernel, inputs, outputs, coord_dim );
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
uint32_t index = 0;
|
||||
/* Pass parameters to node. */
|
||||
node_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[2], rs_out_dim );
|
||||
node_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[0], rs_in_dim );
|
||||
node_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shapes[1], rs_idx_dim );
|
||||
node_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], rs_out_dim );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &offsetX );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &offsetY );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &offsetZ );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &offsetW );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &offset_idx );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &idx_num );
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _SCATTER_ND_UPDATE_PARAM_NUM );
|
||||
CHECK_STATUS(status);
|
||||
vsi_nn_kernel_tensor_release( &node_params[0] );
|
||||
vsi_nn_kernel_tensor_release( &node_params[1] );
|
||||
vsi_nn_kernel_tensor_release( &node_params[2] );
|
||||
vsi_nn_kernel_tensor_release( &node_params[3] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[4] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[5] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[6] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[7] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[8] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[9] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[10] );
|
||||
}
|
||||
}
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_CL( scatter_nd_update, _setup )
|
||||
|
||||
|
|
@ -0,0 +1,298 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "utils/vsi_nn_dtype_util.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
#define SIGNAL_FRAME_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
|
||||
( ( IN_DTYPE << 8 ) | ( OUT_DTYPE ) )
|
||||
#define SIGNAL_FRAME_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
|
||||
{ SIGNAL_FRAME_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \
|
||||
CVIVANTE_NAMESPACE("cl.signal_frame_"#IN_DTYPE"to"#OUT_DTYPE), \
|
||||
"signal_frame" }
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint32_t key;
|
||||
char * function_name;
|
||||
const char * source_name;
|
||||
} _kernel_map_type;
|
||||
|
||||
static const _kernel_map_type _signal_frame_kernel_map[] =
|
||||
{
|
||||
// Register kernel here
|
||||
SIGNAL_FRAME_KERNEL_MAP( F32, F32 ),
|
||||
|
||||
SIGNAL_FRAME_KERNEL_MAP( U8, U8)
|
||||
};
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _signal_frame_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
// Add kererl parameters here
|
||||
};
|
||||
#define _SIGNAL_FRAME_PARAM_NUM _cnt_of_array( _signal_frame_kernel_param_def )
|
||||
#define FRAME_STEP (2)
|
||||
|
||||
/*
|
||||
* Kernel initializer
|
||||
*/
|
||||
DEF_KERNEL_INITIALIZER(_signal_frame_initializer)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
gpu_param_t gpu_param = {
|
||||
3, // workdim
|
||||
{0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
|
||||
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
|
||||
{0, 0, 0}, // localWorkSize: local group size in thread
|
||||
{0, 0, 0} // globalWorkSize: image size in thread
|
||||
};
|
||||
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
|
||||
vsi_int_array_t * out_shape = NULL;
|
||||
|
||||
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
|
||||
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
|
||||
|
||||
out_shape = attr[1]->shape;
|
||||
|
||||
gpu_param.global_scale[0] = 1;
|
||||
gpu_param.global_scale[1] = 1;
|
||||
gpu_param.global_scale[2] = 1;
|
||||
gpu_param.global_size[0] = gpu_align_p2(
|
||||
(out_shape->data[0] + gpu_param.global_scale[0] - 1)
|
||||
/ gpu_param.global_scale[0], 4);
|
||||
gpu_param.global_size[1] = (
|
||||
(out_shape->data[1] + gpu_param.global_scale[1] - 1)
|
||||
/ gpu_param.global_scale[1]);
|
||||
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
|
||||
|
||||
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
|
||||
|
||||
final:
|
||||
#define SAFE_FREE_TENSOR_ATTR(_PTR) if ( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
|
||||
SAFE_FREE_TENSOR_ATTR(attr[0]);
|
||||
SAFE_FREE_TENSOR_ATTR(attr[1]);
|
||||
#undef SAFE_FREE_TENSOR_ATTR
|
||||
|
||||
return status;
|
||||
} /* _erf_initializer() */
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_dtype_e in_dtype;
|
||||
vsi_nn_kernel_dtype_e out_dtype;
|
||||
uint32_t key = 0;
|
||||
uint32_t i = 0;
|
||||
|
||||
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
|
||||
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
|
||||
|
||||
in_dtype = in_dtype == F16 ? F32 : in_dtype;
|
||||
out_dtype = out_dtype == F16 ? F32 : out_dtype;
|
||||
key = SIGNAL_FRAME_HASH_KEY( in_dtype, out_dtype );
|
||||
|
||||
for ( i = 0; i < _cnt_of_array(_signal_frame_kernel_map); i ++ )
|
||||
{
|
||||
if ( _signal_frame_kernel_map[i].key == key )
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
if ( i < _cnt_of_array(_signal_frame_kernel_map) )
|
||||
{
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _signal_frame_kernel_map[i].function_name );
|
||||
kernel->info.parameters = _signal_frame_kernel_param_def;
|
||||
kernel->info.numParams = _cnt_of_array( _signal_frame_kernel_param_def );
|
||||
kernel->info.initialize = _signal_frame_initializer;
|
||||
// Register code source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
|
||||
_signal_frame_kernel_map[i].source_name );
|
||||
// Register binary source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
|
||||
_signal_frame_kernel_map[i].source_name );
|
||||
status = VSI_SUCCESS;
|
||||
}
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_SIGNAL_FRAME_PARAM_NUM] = {NULL};
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
int32_t frame_length = vsi_nn_kernel_param_get_int32( params, "frame_length" );
|
||||
int32_t frame_step = vsi_nn_kernel_param_get_int32( params, "frame_step" );
|
||||
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
|
||||
int32_t pad_end = vsi_nn_kernel_param_get_int32( params, "pad_end" );
|
||||
float pad_value = vsi_nn_kernel_param_get_float32( params, "pad_val" );
|
||||
int32_t num_frames = outputs[0]->attr.size[axis + 1];
|
||||
int32_t rank = inputs[0]->attr.dim_num;
|
||||
int32_t inner = 1;
|
||||
int32_t outer = 1;
|
||||
int32_t length_samples = inputs[0]->attr.size[axis];
|
||||
int32_t i = 0;
|
||||
vsi_nn_tensor_t* rs_tensors[2] = { NULL };
|
||||
int32_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }};
|
||||
|
||||
for (i = 0; i < axis; i++)
|
||||
{
|
||||
inner *= inputs[0]->attr.size[i];
|
||||
}
|
||||
|
||||
for (i = axis + 1; i < rank; i++)
|
||||
{
|
||||
outer *= inputs[0]->attr.size[i];
|
||||
}
|
||||
|
||||
shape[0][0] = inner;
|
||||
shape[0][1] = length_samples;
|
||||
shape[0][2] = 1;
|
||||
shape[0][3] = outer;
|
||||
|
||||
shape[1][0] = inner;
|
||||
shape[1][1] = frame_length;
|
||||
shape[1][2] = num_frames;
|
||||
shape[1][3] = outer;
|
||||
|
||||
rs_tensors[0] = vsi_nn_reshape_tensor( graph,
|
||||
inputs[0], (uint32_t*)shape[0], 4 );
|
||||
rs_tensors[1] = vsi_nn_reshape_tensor( graph,
|
||||
outputs[0], (uint32_t*)shape[1], 4 );
|
||||
|
||||
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)rs_tensors[1]->attr.size,
|
||||
rs_tensors[1]->attr.dim_num ) )
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
status = _query_kernel( kernel, inputs, outputs );
|
||||
if ( VSI_SUCCESS == status )
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
if ( pad_end )
|
||||
{
|
||||
// Set default border mode.
|
||||
vx_border_t border;
|
||||
uint32_t data = 0;
|
||||
uint32_t dsize = 1;
|
||||
|
||||
vsi_nn_Float32ToDtype(pad_value, (uint8_t*)&data, &outputs[0]->attr.dtype);
|
||||
border.mode = VX_BORDER_CONSTANT;
|
||||
dsize = vsi_nn_GetTypeBytes( inputs[0]->attr.dtype.vx_type );
|
||||
if ( dsize == 1 )
|
||||
{
|
||||
border.constant_value.U8 = (uint8_t)data;
|
||||
}
|
||||
else if ( dsize == 4 )
|
||||
{
|
||||
border.constant_value.U32 = data;
|
||||
}
|
||||
else
|
||||
{
|
||||
border.constant_value.U16 = (uint16_t)data;
|
||||
}
|
||||
|
||||
status |= vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) );
|
||||
}
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( node_params, _SIGNAL_FRAME_PARAM_NUM,
|
||||
rs_tensors, 1, &rs_tensors[1], 1 );
|
||||
|
||||
node_params[FRAME_STEP] = vsi_nn_kernel_scalar_create(
|
||||
graph, I32, &frame_step );
|
||||
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _SIGNAL_FRAME_PARAM_NUM );
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
}
|
||||
}
|
||||
|
||||
final:
|
||||
if (rs_tensors[0])
|
||||
{
|
||||
vsi_nn_ReleaseTensor( &rs_tensors[0] );
|
||||
}
|
||||
|
||||
if (rs_tensors[1])
|
||||
{
|
||||
vsi_nn_ReleaseTensor( &rs_tensors[1] );
|
||||
}
|
||||
|
||||
if (node_params[FRAME_STEP])
|
||||
{
|
||||
vsi_nn_kernel_scalar_release( &node_params[FRAME_STEP] );
|
||||
}
|
||||
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_CL( signal_frame, _setup )
|
||||
|
|
@@ -22,7 +22,6 @@
*
*****************************************************************************/


#include <stdint.h>
#include <stdlib.h>
#include <string.h>

@@ -66,7 +65,6 @@ __BEGIN_DECLS
{ SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1 ), \
SLICE_SH_KERNEL_2D_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE }


typedef struct
{
uint32_t key;

@@ -221,7 +219,6 @@ static vsi_status _query_kernel
return status;
} /* _query_kernel() */


static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,

@@ -268,7 +265,7 @@ static vsi_nn_kernel_node_t _setup
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[0]->attr.size,
inputs[0]->attr.dim_num ) || input_batch != output_batch )
{
return NULL;
goto final;
}

image_2d = (rank[0] < 3 || shapes[0][2] == 1);

@@ -300,6 +297,13 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] );
}
}

final:
for (i = 0; i < _IO_NUM; i++)
{
vsi_safe_release_tensor(reshape_tensors[i]);
}

return node;
} /* _setup() */
@ -0,0 +1,535 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "libnnext/vx_lib_nnext.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
/*
|
||||
* Define kernel meta.
|
||||
*/
|
||||
#define _INPUT_NUM (3)
|
||||
#define _OUTPUT_NUM (4)
|
||||
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.box_with_nms_limit")
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _box_with_nms_limit_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
};
|
||||
#define _BOX_WITH_NMS_LIMIT_PARAM_NUM _cnt_of_array( _box_with_nms_limit_kernel_param_def )
|
||||
#define SCORE_THRESHOLD (7)
|
||||
#define MAX_NUM_DETECTIONS (8)
|
||||
#define NMS_KERNEL_METHOD (9)
|
||||
#define IOU_THRESHOLD (10)
|
||||
#define SIGMA (11)
|
||||
#define NMS_SCORE_THRESHOLD (12)
|
||||
|
||||
static float hard_nms_kernel
|
||||
(
|
||||
float iou,
|
||||
float iouThreshold
|
||||
)
|
||||
{
|
||||
return iou < iouThreshold ? 1.0f : 0.0f;
|
||||
}
|
||||
|
||||
static float linear_nms_kernel
|
||||
(
|
||||
float iou,
|
||||
float iouThreshold
|
||||
)
|
||||
{
|
||||
return iou < iouThreshold ? 1.0f : 1.0f - iou;
|
||||
}
|
||||
|
||||
static float gaussian_nms_kernel
|
||||
(
|
||||
float iou,
|
||||
float sigma
|
||||
)
|
||||
{
|
||||
return (float)(exp(-1.0f * iou * iou / sigma));
|
||||
}
|
||||
|
||||
void swap_element
|
||||
(
|
||||
uint32_t* list,
|
||||
uint32_t first,
|
||||
uint32_t second
|
||||
)
|
||||
{
|
||||
uint32_t temp = list[first];
|
||||
list[first] = list[second];
|
||||
list[second] = temp;
|
||||
}
|
||||
|
||||
uint32_t max_element
|
||||
(
|
||||
float* data,
|
||||
uint32_t* index_list,
|
||||
uint32_t len
|
||||
)
|
||||
{
|
||||
uint32_t i;
|
||||
uint32_t max_index = 0;
|
||||
float max_val = data[index_list[0]];
|
||||
for(i = 1; i < len; i++)
|
||||
{
|
||||
float val = data[index_list[i]];
|
||||
if (max_val < val)
|
||||
{
|
||||
max_val = val;
|
||||
max_index = i;
|
||||
}
|
||||
}
|
||||
return max_index;
|
||||
}
|
||||
|
||||
static uint32_t max_comp_func
|
||||
(
|
||||
void* data,
|
||||
int32_t left,
|
||||
int32_t right
|
||||
)
|
||||
{
|
||||
float* fdata = (float*)data;
|
||||
return fdata[left] >= fdata[right];
|
||||
}
|
||||
|
||||
void sort_element_by_score
|
||||
(
|
||||
float* data,
|
||||
uint32_t* index_list,
|
||||
uint32_t len
|
||||
)
|
||||
{
|
||||
vsi_nn_partition(data, 0, len - 1, max_comp_func, TRUE, index_list);
|
||||
}
|
||||
|
||||
typedef struct
|
||||
{
|
||||
float* fdata;
|
||||
uint32_t numClasses;
|
||||
} class_comp_param;
|
||||
|
||||
static uint32_t class_comp_func
|
||||
(
|
||||
void* data,
|
||||
int32_t left,
|
||||
int32_t right
|
||||
)
|
||||
{
|
||||
class_comp_param *p = (class_comp_param*)data;
|
||||
float* fdata = p->fdata;
|
||||
uint32_t numClasses = p->numClasses;
|
||||
uint32_t lhsClass = left % numClasses, rhsClass = right % numClasses;
|
||||
return lhsClass == rhsClass ? fdata[left] > fdata[right]
|
||||
: lhsClass < rhsClass;
|
||||
}
|
||||
|
||||
static void sort_element_by_class
|
||||
(
|
||||
float* data,
|
||||
uint32_t* index_list,
|
||||
uint32_t len,
|
||||
uint32_t numClasses
|
||||
)
|
||||
{
|
||||
class_comp_param class_comp;
|
||||
class_comp.fdata = data;
|
||||
class_comp.numClasses = numClasses;
|
||||
vsi_nn_partition(&class_comp, 0, len - 1, class_comp_func, TRUE, index_list);
|
||||
}
|
||||
|
||||
// Taking two indices of bounding boxes, return the intersection-of-union.
|
||||
float getIoUAxisAligned
|
||||
(
|
||||
const float* roi1,
|
||||
const float* roi2
|
||||
)
|
||||
{
|
||||
const float area1 = (roi1[2] - roi1[0]) * (roi1[3] - roi1[1]);
|
||||
const float area2 = (roi2[2] - roi2[0]) * (roi2[3] - roi2[1]);
|
||||
const float x1 = vsi_nn_max(roi1[0], roi2[0]);
|
||||
const float x2 = vsi_nn_min(roi1[2], roi2[2]);
|
||||
const float y1 = vsi_nn_max(roi1[1], roi2[1]);
|
||||
const float y2 = vsi_nn_min(roi1[3], roi2[3]);
|
||||
const float w = vsi_nn_max(x2 - x1, 0.0f);
|
||||
const float h = vsi_nn_max(y2 - y1, 0.0f);
|
||||
const float areaIntersect = w * h;
|
||||
const float areaUnion = area1 + area2 - areaIntersect;
|
||||
return areaIntersect / areaUnion;
|
||||
}
|
||||
/*
|
||||
* Kernel function
|
||||
*/
|
||||
DEF_KERNEL_EXECUTOR(_compute)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
|
||||
float *f32_in_buffer[_INPUT_NUM] = {NULL};
|
||||
int32_t* int32_in_buffer[_INPUT_NUM] = {NULL};
|
||||
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
|
||||
int32_t* int32_out_buffer[_OUTPUT_NUM] = {0};
|
||||
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
|
||||
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
|
||||
size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
|
||||
size_t out_elements[_OUTPUT_NUM] = {0};
|
||||
size_t out_bytes[_OUTPUT_NUM] = {0};
|
||||
uint32_t i = 0;
|
||||
float score_threshold = 0;
|
||||
int32_t max_num_detections = 0;
|
||||
int32_t nms_kernel_method = 0;
|
||||
float iou_threshold = 0;
|
||||
float sigma = 0;
|
||||
float nms_score_threshold = 0;
|
||||
uint32_t j = 0, n = 0, b = 0, c = 0;
|
||||
const uint32_t kRoiDim = 4;
|
||||
uint32_t numRois = 0;
|
||||
uint32_t numClasses = 0;
|
||||
int32_t ind = 0;
|
||||
uint32_t * batch_data = NULL;
|
||||
int32_t numBatch = 0;
|
||||
uint32_t * select = NULL;
|
||||
uint32_t select_size = 0;
|
||||
uint32_t scores_index = 0;
|
||||
uint32_t roi_index = 0;
|
||||
uint32_t roi_out_index = 0;
|
||||
|
||||
/* prepare data */
|
||||
for (i = 0; i < _INPUT_NUM; i ++)
|
||||
{
|
||||
input[i] = (vsi_nn_kernel_tensor_t)param[i];
|
||||
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
|
||||
if (i == 2)
|
||||
{
|
||||
int32_in_buffer[i] = (int32_t*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
|
||||
CHECK_PTR_FAIL_GOTO( int32_in_buffer[i], "Create input buffer fail.", final );
|
||||
}
|
||||
else
|
||||
{
|
||||
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
|
||||
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input buffer fail.", final );
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < _OUTPUT_NUM; i ++)
|
||||
{
|
||||
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
|
||||
out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
|
||||
vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] );
|
||||
out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
|
||||
out_bytes[i] = out_elements[i] * sizeof(float);
|
||||
if (i < 2)
|
||||
{
|
||||
f32_out_buffer[i] = (float *)malloc( out_bytes[i] );
|
||||
CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
|
||||
memset( f32_out_buffer[i], 0, out_bytes[i] );
|
||||
}
|
||||
else
|
||||
{
|
||||
int32_out_buffer[i] = (int32_t *)malloc( out_bytes[i] );
|
||||
CHECK_PTR_FAIL_GOTO( int32_out_buffer[i], "Create output buffer fail.", final );
|
||||
memset( int32_out_buffer[i], 0, out_bytes[i] );
|
||||
}
|
||||
}
|
||||
|
||||
#define VSI_NN_KERNEL_READ_SCALAR(type, idx, pointer) \
|
||||
vsi_nn_kernel_scalar_read_##type((vsi_nn_kernel_scalar_t)param[idx], pointer)
|
||||
|
||||
status = VSI_NN_KERNEL_READ_SCALAR(float32, SCORE_THRESHOLD, &score_threshold);
|
||||
status |= VSI_NN_KERNEL_READ_SCALAR(int32, MAX_NUM_DETECTIONS, &max_num_detections);
|
||||
status |= VSI_NN_KERNEL_READ_SCALAR(int32, NMS_KERNEL_METHOD, &nms_kernel_method);
|
||||
status |= VSI_NN_KERNEL_READ_SCALAR(float32, IOU_THRESHOLD, &iou_threshold);
|
||||
status |= VSI_NN_KERNEL_READ_SCALAR(float32, SIGMA, &sigma);
|
||||
status |= VSI_NN_KERNEL_READ_SCALAR(float32, NMS_SCORE_THRESHOLD, &nms_score_threshold);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
#undef VSI_NN_KERNEL_READ_SCALAR
|
||||
|
||||
numRois = in_attr[0]->shape->data[1];
|
||||
numClasses = in_attr[0]->shape->data[0];
|
||||
|
||||
batch_data = (uint32_t*)malloc(numRois * sizeof(uint32_t));
|
||||
CHECK_PTR_FAIL_GOTO( batch_data, "Create batch_data fail.", final );
|
||||
memset(batch_data, 0, numRois * sizeof(uint32_t));
|
||||
|
||||
for (i = 0, ind = -1; i < numRois; i++)
|
||||
{
|
||||
if (int32_in_buffer[2][i] != ind)
|
||||
{
|
||||
ind = int32_in_buffer[2][i];
|
||||
numBatch++;
|
||||
}
|
||||
batch_data[numBatch - 1]++;
|
||||
}
|
||||
select = (uint32_t*)malloc(numBatch * numRois
|
||||
* numClasses * sizeof(uint32_t));
|
||||
CHECK_PTR_FAIL_GOTO( select, "Create select fail.", final );
|
||||
memset(select, 0, numBatch * numRois * numClasses * sizeof(uint32_t));
|
||||
for (n = 0; n < (uint32_t)numBatch; n++)
|
||||
{
|
||||
int32_t numDetections_batch = 0;
|
||||
uint32_t select_start_batch = select_size;
|
||||
uint32_t select_len = 0;
|
||||
// Exclude class 0 (background)
|
||||
for (c = 1; c < numClasses; c++)
|
||||
{
|
||||
uint32_t select_start = select_size;
|
||||
int32_t maxNumDetections0 = max_num_detections;
|
||||
uint32_t numDetections = 0;
|
||||
for (b = 0; b < batch_data[n]; b++)
|
||||
{
|
||||
uint32_t index = b * numClasses + c;
|
||||
float score = f32_in_buffer[0][scores_index + index];
|
||||
if (score > score_threshold) {
|
||||
select[select_size] = index;
|
||||
select_size++;
|
||||
}
|
||||
}
|
||||
select_len = select_size - select_start;
|
||||
|
||||
if (maxNumDetections0 < 0)
|
||||
{
|
||||
maxNumDetections0 = select_len;
|
||||
}
|
||||
|
||||
for (j = 0; (j < select_len && numDetections < (uint32_t)maxNumDetections0); j++)
|
||||
{
|
||||
// find max score and swap to the front.
|
||||
int32_t max_index = max_element(&(f32_in_buffer[0][scores_index]),
|
||||
&(select[select_start + j]), select_len - j) + j;
|
||||
|
||||
swap_element(&(select[select_start]), max_index, j);
|
||||
|
||||
// Calculate IoU of the rest, swap to the end (disgard) if needed.
|
||||
for (i = j + 1; i < select_len; i++)
|
||||
{
|
||||
int32_t roiBase0 = roi_index + select[select_start + i] * kRoiDim;
|
||||
int32_t roiBase1 = roi_index + select[select_start + j] * kRoiDim;
|
||||
float iou = getIoUAxisAligned(&(f32_in_buffer[1][roiBase0]),
|
||||
&(f32_in_buffer[1][roiBase1]));
|
||||
float kernel_iou;
|
||||
if (nms_kernel_method == 0)
|
||||
{
|
||||
kernel_iou = hard_nms_kernel(iou, iou_threshold);
|
||||
}
|
||||
else if (nms_kernel_method == 1)
|
||||
{
|
||||
kernel_iou = linear_nms_kernel(iou, iou_threshold);
|
||||
}
|
||||
else
|
||||
{
|
||||
kernel_iou = gaussian_nms_kernel(iou, sigma);
|
||||
}
|
||||
f32_in_buffer[0][scores_index + select[select_start + i]] *= kernel_iou;
|
||||
if (f32_in_buffer[0][scores_index + select[select_start + i]] < nms_score_threshold)
|
||||
{
|
||||
swap_element(&(select[select_start]), i, select_len - 1);
|
||||
i--;
|
||||
select_len--;
|
||||
}
|
||||
}
|
||||
numDetections++;
|
||||
}
|
||||
select_size = select_start + select_len;
|
||||
numDetections_batch += numDetections;
|
||||
}
|
||||
|
||||
// Take top max_num_detections.
|
||||
sort_element_by_score(&(f32_in_buffer[0][scores_index]), &(select[select_start_batch]),
|
||||
numDetections_batch);
|
||||
|
||||
if (numDetections_batch > max_num_detections && max_num_detections >= 0)
|
||||
{
|
||||
select_size = select_start_batch + max_num_detections;
|
||||
}
|
||||
select_len = select_size - select_start_batch;
|
||||
// Sort again by class.
|
||||
sort_element_by_class(&(f32_in_buffer[0][scores_index]), &(select[select_start_batch]),
|
||||
select_len, numClasses);
|
||||
|
||||
for (i = 0; i < select_len; i++)
|
||||
{
|
||||
int32_t in_index0 = scores_index + select[select_start_batch + i];
|
||||
int32_t in_index1 = roi_index + select[select_start_batch + i] * kRoiDim;
|
||||
f32_out_buffer[0][roi_out_index] = f32_in_buffer[0][in_index0];
|
||||
memcpy(&(f32_out_buffer[1][roi_out_index * kRoiDim]),
|
||||
&f32_in_buffer[1][in_index1], kRoiDim * sizeof(float));
|
||||
int32_out_buffer[2][roi_out_index] = select[select_start_batch + i] % numClasses;
|
||||
int32_out_buffer[3][roi_out_index] = n;
|
||||
roi_out_index++;
|
||||
}
|
||||
|
||||
scores_index += batch_data[n] * numClasses;
|
||||
roi_index += batch_data[n] * numClasses * kRoiDim;
|
||||
}
|
||||
|
||||
/* save data */
|
||||
for(i = 0; i < _OUTPUT_NUM; i++)
|
||||
{
|
||||
if (i < 2)
|
||||
{
|
||||
status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i],
|
||||
f32_out_buffer[i], out_elements[i] );
|
||||
}
|
||||
else
|
||||
{
|
||||
status = vsi_nn_kernel_tensor_write( output[i], out_attr[i],
|
||||
int32_out_buffer[i], out_bytes[i] );
|
||||
}
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
}
|
||||
final:
|
||||
vsi_nn_safe_free(batch_data);
|
||||
vsi_nn_safe_free(select);
|
||||
for (i = 0; i < _INPUT_NUM; i++)
|
||||
{
|
||||
vsi_nn_safe_free(f32_in_buffer[i]);
|
||||
vsi_nn_safe_free(int32_in_buffer[i]);
|
||||
|
||||
if (in_attr[i])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
|
||||
}
|
||||
}
|
||||
for (i = 0; i < _OUTPUT_NUM; i++)
|
||||
{
|
||||
vsi_nn_safe_free(f32_out_buffer[i]);
|
||||
vsi_nn_safe_free(int32_out_buffer[i]);
|
||||
|
||||
if (out_attr[i])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
|
||||
}
|
||||
}
|
||||
|
||||
return status;
|
||||
} /* _compute() */
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs
|
||||
/* Add extra params */
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_SUCCESS;
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
|
||||
kernel->info.function = _compute;
|
||||
kernel->info.parameters = _box_with_nms_limit_kernel_param_def;
|
||||
kernel->info.numParams = _cnt_of_array( _box_with_nms_limit_kernel_param_def );
|
||||
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_BOX_WITH_NMS_LIMIT_PARAM_NUM] = {NULL};
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
float score_threshold = vsi_nn_kernel_param_get_float32( params, "score_threshold" );
|
||||
int32_t max_num_detections = vsi_nn_kernel_param_get_int32( params, "max_num_detections" );
|
||||
int32_t nms_kernel_method = vsi_nn_kernel_param_get_int32( params, "nms_kernel_method" );
|
||||
float iou_threshold = vsi_nn_kernel_param_get_float32( params, "iou_threshold" );
|
||||
float sigma = vsi_nn_kernel_param_get_float32( params, "sigma" );
|
||||
float nms_score_threshold = vsi_nn_kernel_param_get_float32( params, "nms_score_threshold" );
|
||||
|
||||
status = _query_kernel( kernel, inputs, outputs );
|
||||
if ( VSI_SUCCESS == status )
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( node_params, _BOX_WITH_NMS_LIMIT_PARAM_NUM,
|
||||
inputs, input_num, outputs, output_num );
|
||||
node_params[SCORE_THRESHOLD] = vsi_nn_kernel_scalar_create( graph, F32, &score_threshold );
|
||||
node_params[MAX_NUM_DETECTIONS] = vsi_nn_kernel_scalar_create( graph, I32, &max_num_detections );
|
||||
node_params[NMS_KERNEL_METHOD] = vsi_nn_kernel_scalar_create( graph, I32, &nms_kernel_method );
|
||||
node_params[IOU_THRESHOLD] = vsi_nn_kernel_scalar_create( graph, F32, &iou_threshold );
|
||||
node_params[SIGMA] = vsi_nn_kernel_scalar_create( graph, F32, &sigma );
|
||||
node_params[NMS_SCORE_THRESHOLD] = vsi_nn_kernel_scalar_create( graph, F32, &nms_score_threshold );
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _BOX_WITH_NMS_LIMIT_PARAM_NUM );
|
||||
vsi_nn_kernel_scalar_release( &node_params[SCORE_THRESHOLD] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[MAX_NUM_DETECTIONS] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[NMS_KERNEL_METHOD] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[IOU_THRESHOLD] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[SIGMA] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[NMS_SCORE_THRESHOLD] );
|
||||
}
|
||||
}
|
||||
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_CPU( box_with_nms_limit, _setup )
@@ -47,6 +47,8 @@ typedef enum
|
|||
UNARY_HSIGMOID,
|
||||
UNARY_MISH,
|
||||
UNARY_ROUND,
|
||||
UNARY_GELU,
|
||||
UNARY_HGELU,
|
||||
} unary_type_e;
@@ -109,6 +111,58 @@ static float round_eval(float data)
|
|||
return data;
|
||||
}
|
||||
|
||||
static float erf_eval(float x)
|
||||
{
|
||||
float res = 0;
|
||||
float tmp = x;
|
||||
float factorial = 1; /*n!*/
|
||||
float x_pow = x;
|
||||
int32_t one = 1;
|
||||
int32_t n = 1;
|
||||
|
||||
if (x <= -3)
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
else if (x >= 3)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
while (vsi_abs(tmp) > 1e-5)
|
||||
{
|
||||
res += tmp;
|
||||
|
||||
factorial *= n;
|
||||
one *= -1;
|
||||
x_pow *= x * x;
|
||||
tmp = one / factorial * x_pow / ( 2 * n + 1);
|
||||
|
||||
n ++;
|
||||
}
|
||||
#define VSI_MUL2_RSQRTPI (1.1283791670955126f)
|
||||
|
||||
res *= VSI_MUL2_RSQRTPI;
|
||||
|
||||
return res;
|
||||
}
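/* erf_eval sums the Maclaurin series erf(x) = 2/sqrt(pi) * sum_n (-1)^n * x^(2n+1) / (n! * (2n+1))
 * until the next term drops below 1e-5; inputs outside [-3, 3] are saturated to +/-1. */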
|
||||
|
||||
static float gelu_eval(float data)
|
||||
{
|
||||
data = (float)(0.5f * data * (1 + erf_eval(data / (float)sqrt(2.0f))));
|
||||
|
||||
return data;
|
||||
}
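/* gelu_eval is the exact (erf-based) GELU: x * Phi(x) = 0.5 * x * (1 + erf(x / sqrt(2))). */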
|
||||
|
||||
#define VSI_SQRT_2_RCP_PI 0.7978845834732056f
|
||||
static float hgelu_eval(float data)
|
||||
{
|
||||
float cdf = (float)(0.5f * (1.0f + tanh((VSI_SQRT_2_RCP_PI *
|
||||
(data + 0.044715f * data * data * data)))));
|
||||
|
||||
return data * cdf;
|
||||
}
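/* hgelu_eval is the tanh-based GELU approximation:
 * 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))), exposed as UNARY_HGELU / "hard_gelu". */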
|
||||
|
||||
DEF_KERNEL_EXECUTOR(_eltwise_unary_exec)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
@@ -176,6 +230,12 @@ DEF_KERNEL_EXECUTOR(_eltwise_unary_exec)
|
|||
case UNARY_ROUND:
|
||||
data = round_eval(data);
|
||||
break;
|
||||
case UNARY_GELU:
|
||||
data = gelu_eval(data);
|
||||
break;
|
||||
case UNARY_HGELU:
|
||||
data = hgelu_eval(data);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
@@ -309,4 +369,6 @@ REGISTER_ELTWISE_UNARY_BACKEND_CPU( elu, UNARY_ELU )
|
|||
REGISTER_ELTWISE_UNARY_BACKEND_CPU( neg, UNARY_NEG )
|
||||
REGISTER_ELTWISE_UNARY_BACKEND_CPU( hard_sigmoid, UNARY_HSIGMOID )
|
||||
REGISTER_ELTWISE_UNARY_BACKEND_CPU( mish, UNARY_MISH )
|
||||
REGISTER_ELTWISE_UNARY_BACKEND_CPU( round, UNARY_ROUND )
|
||||
REGISTER_ELTWISE_UNARY_BACKEND_CPU( gelu, UNARY_GELU )
|
||||
REGISTER_ELTWISE_UNARY_BACKEND_CPU( hard_gelu, UNARY_HGELU )
@@ -101,11 +101,11 @@ DEF_KERNEL_EXECUTOR(_compute)
|
|||
CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
|
||||
memset( f32_out_buffer[i], 0, out_bytes[i] );
|
||||
}
|
||||
#define ERF_PI 3.141592653589793
|
||||
#define VSI_ERF_PI 3.141592653589793
|
||||
for (i = 0; i < out_elements[0]; i ++)
|
||||
{
|
||||
/* erf(x) = 2 / sqrt(pi) * sum_n[ (-1)^n * x^(2n + 1) / (n! * (2n + 1)) ] */
|
||||
float x = f32_in_buffer[0][i];
|
||||
float x = vsi_clamp(f32_in_buffer[0][i], -2, 2);
|
||||
float res = 0;
|
||||
float tmp = x;
|
||||
float factorial = 1; /*n!*/
@@ -126,7 +126,7 @@ DEF_KERNEL_EXECUTOR(_compute)
|
|||
}
|
||||
|
||||
|
||||
res *= 2.0f / (float)sqrt(ERF_PI);
|
||||
res *= 2.0f / (float)sqrt(VSI_ERF_PI);
|
||||
|
||||
f32_out_buffer[0][i] = res;
|
||||
}
@@ -0,0 +1,188 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "libnnext/vx_lib_nnext.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
/*
|
||||
* Define kernel meta.
|
||||
*/
|
||||
#define _INPUT_NUM (2)
|
||||
#define _OUTPUT_NUM (1)
|
||||
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.extra_ending")
|
||||
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _extra_ending_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
// Add kernel parameters here
|
||||
};
|
||||
#define _EXTRA_ENDING_PARAM_NUM _cnt_of_array( _extra_ending_kernel_param_def )
|
||||
|
||||
/*
|
||||
* Kernel function
|
||||
*/
|
||||
DEF_KERNEL_EXECUTOR(_compute)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
|
||||
uint8_t *u8_in_buffer[_INPUT_NUM] = {NULL};
|
||||
uint8_t *u8_out_buffer[_OUTPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
|
||||
size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
|
||||
size_t out_elements[_OUTPUT_NUM] = {0};
|
||||
size_t out_bytes[_OUTPUT_NUM] = {0};
|
||||
int32_t i = 0;
|
||||
|
||||
/* prepare data */
|
||||
input[1] = (vsi_nn_kernel_tensor_t)param[1];
|
||||
in_attr[1] = vsi_nn_kernel_tensor_attr_create( input[1] );
|
||||
u8_in_buffer[1] = (uint8_t*)vsi_nn_kernel_tensor_create_buffer( input[1], in_attr[1], FALSE );
|
||||
CHECK_PTR_FAIL_GOTO( u8_in_buffer[1], "Create input buffer fail.", final );
|
||||
|
||||
for (i = 0; i < _OUTPUT_NUM; i ++)
|
||||
{
|
||||
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
|
||||
out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
|
||||
vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] );
|
||||
out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
|
||||
out_bytes[i] = out_elements[i] * sizeof(uint8_t);
|
||||
u8_out_buffer[i] = (uint8_t *)malloc( out_bytes[i] );
|
||||
CHECK_PTR_FAIL_GOTO( u8_out_buffer[i], "Create output buffer fail.", final );
|
||||
memset( u8_out_buffer[i], 0, out_bytes[i] );
|
||||
}
|
||||
|
||||
memcpy(u8_out_buffer[0], u8_in_buffer[1], out_bytes[0]);
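// Only input 1 is read: the output is a byte-for-byte copy of it. Input 0 is not
// touched here and presumably just sequences this op after its producer in the graph.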
|
||||
|
||||
for (i = 0; i < _OUTPUT_NUM; i ++)
|
||||
{
|
||||
status = vsi_nn_kernel_tensor_write( output[i], out_attr[i],
|
||||
u8_out_buffer[i], out_bytes[i] );
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
}
|
||||
final:
|
||||
for (i = 0; i < _INPUT_NUM; i++)
|
||||
{
|
||||
vsi_nn_safe_free(u8_in_buffer[i]);
|
||||
|
||||
if (in_attr[i])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
|
||||
}
|
||||
}
|
||||
for (i = 0; i < _OUTPUT_NUM; i++)
|
||||
{
|
||||
vsi_nn_safe_free(u8_out_buffer[i]);
|
||||
|
||||
if (out_attr[i])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
|
||||
}
|
||||
}
|
||||
|
||||
return status;
|
||||
} /* _compute() */
|
||||
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_SUCCESS;
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
|
||||
kernel->info.function = _compute;
|
||||
kernel->info.parameters = _extra_ending_kernel_param_def;
|
||||
kernel->info.numParams = _cnt_of_array( _extra_ending_kernel_param_def );
|
||||
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_EXTRA_ENDING_PARAM_NUM] = {NULL};
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
|
||||
status = _query_kernel( kernel, inputs, outputs );
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( node_params, _EXTRA_ENDING_PARAM_NUM,
|
||||
inputs, input_num, outputs, output_num );
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _EXTRA_ENDING_PARAM_NUM );
|
||||
}
|
||||
}
|
||||
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_CPU( extra_ending, _setup )
@@ -0,0 +1,323 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <float.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "libnnext/vx_lib_nnext.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
/*
|
||||
* Define kernel meta.
|
||||
*/
|
||||
#define _INPUT_NUM (2)
|
||||
#define _OUTPUT_NUM (2)
|
||||
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.heatmap_max_keypoint")
|
||||
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _heatmap_max_keypoint_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
};
|
||||
#define _HEATMAP_MAX_KEYPOINT_PARAM_NUM _cnt_of_array( _heatmap_max_keypoint_kernel_param_def )
|
||||
|
||||
// This function uses Taylor expansion up to the quadratic term to approximate bicubic
|
||||
// upscaling result.
|
||||
// 2nd order Taylor expansion: D(x) = D - b'x + 1/2 * x'Ax
|
||||
// where D = grid[1][1], Taylor expansion center, the original score,
|
||||
// x = delta, the correction on max keypoint position,
|
||||
// D(x) = deltaScore, the accuracy score after correction
|
||||
static void _solve_for_delta
|
||||
(
|
||||
const float grid[3][3],
|
||||
float* delta,
|
||||
float* deltaScore,
|
||||
float fpAtol,
|
||||
float fpRtol
|
||||
)
|
||||
{
|
||||
// b: negative 1st order derivative at center
|
||||
// A: Hessian matrix at center (2nd order derivative)
|
||||
float A[2][2], b[2];
|
||||
float crossProd1, crossProd2;
|
||||
float detA;
|
||||
b[0] = -(grid[1][2] - grid[1][0]) / 2.0f;
|
||||
b[1] = -(grid[2][1] - grid[0][1]) / 2.0f;
|
||||
A[0][0] = grid[1][0] - 2.0f * grid[1][1] + grid[1][2];
|
||||
A[0][1] = (grid[2][2] - grid[2][0] - grid[0][2] + grid[0][0]) / 4.0f;
|
||||
A[1][0] = A[0][1];
|
||||
A[1][1] = grid[0][1] - 2.0f * grid[1][1] + grid[2][1];
|
||||
|
||||
// solve Ax=b, where x=delta -> delta = inv(A) * b
|
||||
crossProd1 = A[0][0] * A[1][1];
|
||||
crossProd2 = A[0][1] * A[1][0];
|
||||
detA = crossProd1 - crossProd2;
|
||||
// check if A is invertible
|
||||
if (fabs(detA) < (fpAtol + fpRtol * crossProd1)) return;
|
||||
delta[0] = (A[1][1] * b[0] - A[0][1] * b[1]) / detA;
|
||||
delta[1] = (A[0][0] * b[1] - A[1][0] * b[0]) / detA;
|
||||
|
||||
// clip out of range delta, i.e. delta > 3/2
|
||||
if (fabs(delta[0]) > 1.5f || fabs(delta[1]) > 1.5f)
|
||||
{
|
||||
float scale = (float)(1.5f / vsi_nn_max(fabs(delta[0]), fabs(delta[1])));
|
||||
delta[0] *= scale;
|
||||
delta[1] *= scale;
|
||||
}
|
||||
|
||||
*deltaScore = grid[1][1] - b[0] * delta[0] - b[1] * delta[1] +
|
||||
((A[0][0] * delta[0] + A[0][1] * delta[1]) * delta[0] +
|
||||
(A[1][0] * delta[0] + A[1][1] * delta[1]) * delta[1]) /
|
||||
2.0f;
|
||||
}
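/* Note: delta = inv(A) * b is computed via the 2x2 adjugate (detA = A00*A11 - A01*A10),
 * clamped to +/-1.5, and deltaScore evaluates the quadratic model D - b'x + 1/2 x'Ax at x = delta. */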
|
||||
/*
|
||||
* Kernel function
|
||||
*/
|
||||
DEF_KERNEL_EXECUTOR(_compute)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
|
||||
float *f32_in_buffer[_INPUT_NUM] = {NULL};
|
||||
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
|
||||
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
|
||||
size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
|
||||
size_t out_elements[_OUTPUT_NUM] = {0};
|
||||
size_t out_bytes[_OUTPUT_NUM] = {0};
|
||||
uint32_t i = 0;
|
||||
uint32_t j = 0;
|
||||
uint32_t k = 0;
|
||||
uint32_t numBoxes = 0;
|
||||
uint32_t heatmapSize = 0;
|
||||
uint32_t numKeypoints = 0;
|
||||
uint32_t boxInfoLength = 4;
|
||||
uint32_t output_score_index = 0;
|
||||
uint32_t output_keypoint_index = 0;
|
||||
|
||||
/* prepare data */
|
||||
for (i = 0; i < _INPUT_NUM; i ++)
|
||||
{
|
||||
input[i] = (vsi_nn_kernel_tensor_t)param[i];
|
||||
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
|
||||
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
|
||||
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input buffer fail.", final );
|
||||
}
|
||||
|
||||
for (i = 0; i < _OUTPUT_NUM; i ++)
|
||||
{
|
||||
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
|
||||
out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
|
||||
vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] );
|
||||
out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
|
||||
out_bytes[i] = out_elements[i] * sizeof(float);
|
||||
|
||||
f32_out_buffer[i] = (float *)malloc( out_bytes[i] );
|
||||
CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
|
||||
memset( f32_out_buffer[i], 0, out_bytes[i] );
|
||||
}
|
||||
|
||||
numBoxes = in_attr[0]->shape->data[3];
|
||||
heatmapSize = in_attr[0]->shape->data[2];
|
||||
numKeypoints = in_attr[0]->shape->data[0];
|
||||
|
||||
for(i = 0; i < numBoxes; i++)
|
||||
{
|
||||
for (j = 0; j < numKeypoints; j++)
|
||||
{
|
||||
uint32_t maxIndex = 0;
|
||||
float maxScore = -FLT_MAX;
|
||||
uint32_t maxIndexWidth;
|
||||
uint32_t maxIndexHeight;
|
||||
float localGrid[3][3] = {{0}};
|
||||
int32_t dh, dw;
|
||||
float delta[2] = {0.0f, 0.0f}, deltaScore;
|
||||
float wRoiStart = f32_in_buffer[1][i * boxInfoLength];
|
||||
float hRoiStart = f32_in_buffer[1][i * boxInfoLength + 1];
|
||||
float wRoiEnd = f32_in_buffer[1][i * boxInfoLength + 2];
|
||||
float hRoiEnd = f32_in_buffer[1][i * boxInfoLength + 3];
|
||||
float roiWidth = wRoiEnd - wRoiStart;
|
||||
float roiHeight = hRoiEnd - hRoiStart;
|
||||
float wRelativePos;
|
||||
float hRelativePos;
|
||||
for (k = 0; k < heatmapSize * heatmapSize; k++)
|
||||
{
|
||||
uint32_t index = i * heatmapSize * heatmapSize * numKeypoints
|
||||
+ k * numKeypoints + j;
|
||||
float val = f32_in_buffer[0][index];
|
||||
if (maxScore < val)
|
||||
{
|
||||
maxScore = val;
|
||||
maxIndex = k;
|
||||
}
|
||||
}
|
||||
maxIndexWidth = maxIndex % heatmapSize;
|
||||
maxIndexHeight = maxIndex / heatmapSize;
|
||||
|
||||
// get local 3x3 grid
|
||||
for (dh = -1; dh <= 1; dh++)
|
||||
{
|
||||
for (dw = -1; dw <= 1; dw++)
|
||||
{
|
||||
// cast uint32_t to int32_t
|
||||
int32_t h = (int32_t)(maxIndexHeight) + dh;
|
||||
int32_t w = (int32_t)(maxIndexWidth) + dw;
|
||||
uint32_t heatmapIndex;
|
||||
|
||||
// use mirroring for out of bound indexing
|
||||
// need to ensure heatmapSize >= 2
|
||||
h = h < 0 ? 1 : (h >= (int32_t)heatmapSize ? heatmapSize - 2 : h);
|
||||
w = w < 0 ? 1 : (w >= (int32_t)heatmapSize ? heatmapSize - 2 : w);
|
||||
|
||||
heatmapIndex = i * heatmapSize * heatmapSize * numKeypoints +
|
||||
(uint32_t)(h) * heatmapSize * numKeypoints +
|
||||
(uint32_t)(w) * numKeypoints + j;
|
||||
localGrid[dh + 1][dw + 1] = f32_in_buffer[0][heatmapIndex];
|
||||
}
|
||||
}
|
||||
deltaScore = maxScore;
|
||||
_solve_for_delta((const float (*)[3])localGrid, delta, &deltaScore, 1e-3f, 1e-3f);
|
||||
|
||||
wRelativePos = ((float)(maxIndexWidth) + delta[0] + 0.5f) /
|
||||
(float)(heatmapSize);
|
||||
hRelativePos = ((float)(maxIndexHeight) + delta[1] + 0.5f) /
|
||||
(float)(heatmapSize);
|
||||
f32_out_buffer[0][output_score_index] = deltaScore;
|
||||
f32_out_buffer[1][output_keypoint_index] = wRelativePos * roiWidth + wRoiStart;
|
||||
f32_out_buffer[1][output_keypoint_index + 1] = hRelativePos * roiHeight + hRoiStart;
|
||||
output_score_index++;
|
||||
output_keypoint_index += 2;
|
||||
}
|
||||
}
|
||||
|
||||
/* save data */
|
||||
for(i = 0; i < _OUTPUT_NUM; i++)
|
||||
{
|
||||
status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i],
|
||||
f32_out_buffer[i], out_elements[i] );
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
}
|
||||
final:
|
||||
for (i = 0; i < _INPUT_NUM; i++)
|
||||
{
|
||||
vsi_nn_safe_free(f32_in_buffer[i]);
|
||||
|
||||
if (in_attr[i])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
|
||||
}
|
||||
}
|
||||
for (i = 0; i < _OUTPUT_NUM; i++)
|
||||
{
|
||||
vsi_nn_safe_free(f32_out_buffer[i]);
|
||||
|
||||
if (out_attr[i])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
|
||||
}
|
||||
}
|
||||
|
||||
return status;
|
||||
} /* _compute() */
|
||||
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs
|
||||
/* Add extra params */
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_SUCCESS;
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
|
||||
kernel->info.function = _compute;
|
||||
kernel->info.parameters = _heatmap_max_keypoint_kernel_param_def;
|
||||
kernel->info.numParams = _cnt_of_array( _heatmap_max_keypoint_kernel_param_def );
|
||||
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_HEATMAP_MAX_KEYPOINT_PARAM_NUM] = {NULL};
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
|
||||
status = _query_kernel( kernel, inputs, outputs );
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( node_params, _HEATMAP_MAX_KEYPOINT_PARAM_NUM,
|
||||
inputs, input_num, outputs, output_num );
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _HEATMAP_MAX_KEYPOINT_PARAM_NUM );
|
||||
}
|
||||
}
|
||||
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_CPU( heatmap_max_keypoint, _setup )
@@ -0,0 +1,285 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2019 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "libnnext/vsi_nn_vxkernel.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
/*
|
||||
* Define kernel meta.
|
||||
*/
|
||||
#define _CPU_ARG_NUM (3)
|
||||
#define _CPU_INPUT_NUM (3)
|
||||
#define _CPU_OUTPUT_NUM (1)
|
||||
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
|
||||
#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
|
||||
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.scatter_nd_update")
|
||||
|
||||
DEF_KERNEL_EXECUTOR(_scatter_nd_update_exec)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
|
||||
uint32_t * para_buffer[1] = { NULL };
|
||||
uint32_t * mask = NULL;
|
||||
float * buffer[3] = { NULL };
|
||||
size_t out_elements = 0;
|
||||
vsi_nn_kernel_tensor_attr_t * attr[4] = { NULL };
|
||||
int32_t i = 0, j = 0;
|
||||
int32_t block_size = 1, indices_num = 1;
|
||||
int32_t coord_dim = 1;
|
||||
int32_t mask_len = 0;
|
||||
|
||||
tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; // ref
|
||||
tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; // idx int
|
||||
tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; // update
|
||||
tensors[3] = (vsi_nn_kernel_tensor_t)param[3]; // output
|
||||
|
||||
attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
|
||||
attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
|
||||
attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );
|
||||
attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final );
|
||||
|
||||
out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] );
|
||||
|
||||
buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
|
||||
CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final );
|
||||
|
||||
para_buffer[0] = (uint32_t*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], FALSE );
|
||||
CHECK_PTR_FAIL_GOTO( para_buffer[0], "Create input1 buffer fail.", final );
|
||||
|
||||
buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE );
|
||||
CHECK_PTR_FAIL_GOTO( buffer[1], "Create input2 buffer fail.", final );
|
||||
|
||||
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &(block_size));
|
||||
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &(coord_dim));
|
||||
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &(indices_num));
|
||||
|
||||
buffer[2] = (float *)malloc( out_elements * sizeof(float) );
|
||||
CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final );
|
||||
memcpy( buffer[2], buffer[0], out_elements * sizeof(float) );
|
||||
|
||||
mask_len = (int32_t)out_elements / block_size;
|
||||
mask = (uint32_t *)malloc( mask_len * sizeof(uint32_t) );
CHECK_PTR_FAIL_GOTO( mask, "Create mask buffer fail.", final );
|
||||
memset(mask, 0, mask_len * sizeof(uint32_t));
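/* mask marks output blocks that an update has already touched: the first update to a
 * block zeroes it, and later updates to the same block accumulate into it. */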
|
||||
|
||||
if (coord_dim <= 5)
|
||||
{
|
||||
int32_t stride[5] = {0, 0, 0, 0, 0};
|
||||
int32_t new_shape[5] = {1, 1, 1, 1, 1};
|
||||
int32_t merge_dim = (int32_t)attr[3]->shape->size - coord_dim + 1;
|
||||
|
||||
for(i = 0; i < merge_dim; ++i)
|
||||
{
|
||||
new_shape[0] *= attr[3]->shape->data[i];
|
||||
}
|
||||
stride[0] = new_shape[0] / block_size;
|
||||
|
||||
for(i = 1; i < coord_dim; ++i)
|
||||
{
|
||||
new_shape[i] = attr[3]->shape->data[merge_dim + i - 1];
|
||||
|
||||
stride[i] = stride[i - 1] * new_shape[i];
|
||||
}
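/* The leading (rank - coord_dim + 1) output dims are collapsed into new_shape[0];
 * stride[0] counts blocks in that collapsed dim and stride[j] = stride[j-1] * new_shape[j],
 * so coordinate j (stored innermost-first) contributes coord[j] * stride[j-1] to the flat block index. */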
|
||||
|
||||
for(i = 0; i < indices_num; i++)
|
||||
{
|
||||
uint32_t in_index = i * block_size;
|
||||
uint32_t out_index = 0;
|
||||
uint32_t coord[5] = {0};
|
||||
int32_t byd_flg = 0;
|
||||
int32_t mask_idx = 0;
|
||||
|
||||
for(j = 0; j < coord_dim; j++)
|
||||
{
|
||||
coord[j] = para_buffer[0][i * coord_dim + coord_dim - j - 1];
|
||||
if (coord[j] >= (uint32_t)new_shape[j])
|
||||
{
|
||||
byd_flg = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (byd_flg)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
mask_idx = coord[4] * stride[3] + coord[3] * stride[2] +
|
||||
coord[2] * stride[1] + coord[1] * stride[0] + coord[0];
|
||||
out_index = mask_idx * block_size;
|
||||
if (mask[mask_idx] == 0)
|
||||
{
|
||||
memset(buffer[2] + out_index, 0, block_size * sizeof(float));
|
||||
mask[mask_idx] = 1;
|
||||
}
|
||||
for(j = 0; j < block_size; j++)
|
||||
{
|
||||
buffer[2][out_index + j] += buffer[1][in_index + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
status = VSI_FAILURE;
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
}
|
||||
|
||||
status = vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3],
|
||||
buffer[2], out_elements );
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
|
||||
final:
|
||||
if ( para_buffer[0] )
|
||||
{
|
||||
free( para_buffer[0] );
|
||||
}
|
||||
|
||||
if (mask)
|
||||
{
|
||||
free(mask);
|
||||
}
|
||||
for( i = 0; i < 3; i ++ )
|
||||
{
|
||||
if ( buffer[i] )
|
||||
{
|
||||
free( buffer[i] );
|
||||
}
|
||||
}
|
||||
for( i = 0; i < 4; i ++ )
|
||||
{
|
||||
if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); }
|
||||
}
|
||||
return status;
|
||||
} /* _scatter_nd_update_exec() */
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _scatter_nd_update_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
// Add kernel parameters here
|
||||
};
|
||||
|
||||
static const vx_kernel_description_t _kernel_info =
|
||||
{
|
||||
KERNEL_ID_PLACEHOLDER,
|
||||
_KERNEL_NAME,
|
||||
_scatter_nd_update_exec,
|
||||
_scatter_nd_update_kernel_param_def,
|
||||
_cnt_of_array( _scatter_nd_update_kernel_param_def ),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vsi_nn_KernelInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_tensor_t* const* const inputs,
|
||||
vsi_nn_tensor_t* const* const outputs,
|
||||
vsi_nn_kernel_t* kernel
|
||||
)
|
||||
{
|
||||
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
|
||||
return VSI_SUCCESS;
|
||||
} /* _query_kernel() */
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL};
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
|
||||
int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" );
|
||||
int32_t idx_num = vsi_nn_kernel_param_get_int32( params, "idx_num" );
|
||||
|
||||
status = _query_kernel( inputs, outputs, kernel );
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
uint32_t index = 4;
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM,
|
||||
inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM );
|
||||
backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size );
|
||||
backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim );
|
||||
backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &idx_num );
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );
|
||||
CHECK_STATUS( status );
|
||||
vsi_nn_kernel_scalar_release( &backend_params[4] );
|
||||
vsi_nn_kernel_scalar_release( &backend_params[5] );
|
||||
vsi_nn_kernel_scalar_release( &backend_params[6] );
|
||||
}
|
||||
else
|
||||
{
|
||||
status = VSI_FAILURE;
|
||||
}
|
||||
}
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_CPU( scatter_nd_update, _setup )
@@ -0,0 +1,289 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
/*
|
||||
* Define kernel meta.
|
||||
*/
|
||||
#define _INPUT_NUM (1)
|
||||
#define _OUTPUT_NUM (1)
|
||||
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.signal_frame")
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _signal_frame_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
// Add kernel parameters here
|
||||
};
|
||||
#define _SIGNAL_FRAME_PARAM_NUM _cnt_of_array( _signal_frame_kernel_param_def )
|
||||
#define FRAME_LENGHT (2)
|
||||
#define FRAME_STEP (3)
|
||||
#define AXIS (4)
|
||||
#define PAD_END (5)
|
||||
#define PAD_VAL (6)
|
||||
|
||||
/*
|
||||
* Kernel function
|
||||
*/
|
||||
DEF_KERNEL_EXECUTOR(_compute)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
|
||||
float *f32_in_buffer[_INPUT_NUM] = {NULL};
|
||||
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
|
||||
size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
|
||||
size_t out_elements[_OUTPUT_NUM] = {0};
|
||||
size_t out_bytes[_OUTPUT_NUM] = {0};
|
||||
int32_t i = 0;
|
||||
int32_t j = 0;
|
||||
int32_t k = 0;
|
||||
int32_t frame_length = 0;
|
||||
int32_t frame_step = 0;
|
||||
int32_t axis = 0;
|
||||
int32_t pad_end = 0;
|
||||
int32_t length_samples = 0;
|
||||
int32_t num_frames = 0;
|
||||
int32_t inner_dim = 1;
|
||||
int32_t outer_dim = 1;
|
||||
int32_t inner_size = 1;
|
||||
float pad_val = 0;
|
||||
|
||||
/* prepare data */
|
||||
for (i = 0; i < _INPUT_NUM; i ++)
|
||||
{
|
||||
input[i] = (vsi_nn_kernel_tensor_t)param[i];
|
||||
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
|
||||
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
|
||||
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input buffer fail.", final );
|
||||
}
|
||||
for (i = 0; i < _OUTPUT_NUM; i ++)
|
||||
{
|
||||
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
|
||||
out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
|
||||
vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] );
|
||||
out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
|
||||
out_bytes[i] = out_elements[i] * sizeof(float);
|
||||
f32_out_buffer[i] = (float *)malloc( out_bytes[i] );
|
||||
CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
|
||||
memset( f32_out_buffer[i], 0, out_bytes[i] );
|
||||
}
|
||||
|
||||
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[FRAME_LENGHT], &frame_length);
|
||||
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[FRAME_STEP], &frame_step);
|
||||
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[AXIS], &axis);
|
||||
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[PAD_END], &pad_end);
|
||||
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[PAD_VAL], &pad_val);
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
|
||||
for (i = 0; i < axis; i++)
|
||||
{
|
||||
inner_dim *= in_attr[0]->shape->data[i];
|
||||
}
|
||||
length_samples = in_attr[0]->shape->data[axis];
|
||||
for (i = axis + 1; i < (int32_t)in_attr[0]->shape->size; i++)
|
||||
{
|
||||
outer_dim *= in_attr[0]->shape->data[i];
|
||||
}
|
||||
|
||||
for (i = 0; i < axis + 1; i++)
|
||||
{
|
||||
inner_size *= out_attr[0]->shape->data[i];
|
||||
}
|
||||
|
||||
num_frames = (length_samples + frame_step - 1) / frame_step;
|
||||
num_frames = pad_end ? num_frames : (length_samples - frame_length) / frame_step + 1;
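/* With pad_end the signal yields ceil(length_samples / frame_step) frames and samples past
 * the end are filled with pad_val; otherwise only frames that fit entirely inside the signal are emitted. */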
|
||||
|
||||
for (i = 0; i < outer_dim; i++)
|
||||
{
|
||||
float * src_ptr = f32_in_buffer[0] + i * length_samples * inner_dim;
|
||||
float * dst_ptr = f32_out_buffer[0] + i * num_frames * frame_length * inner_dim;
|
||||
|
||||
for (j = 0; j < num_frames; j++)
|
||||
{
|
||||
for (k = 0; k < frame_length; k++)
|
||||
{
|
||||
int32_t m = j * frame_step + k;
|
||||
|
||||
if (pad_end)
|
||||
{
|
||||
if (m >= length_samples)
|
||||
{
|
||||
int32_t l = 0;
|
||||
for (l = 0; l < inner_dim; l++)
|
||||
{
|
||||
(dst_ptr + (j * frame_length + k) * inner_dim)[l] = pad_val;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy(dst_ptr + (j * frame_length + k) * inner_dim, src_ptr + m * inner_dim,
|
||||
inner_dim * sizeof(float));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy(dst_ptr + (j * frame_length + k) * inner_dim, src_ptr + m * inner_dim,
|
||||
inner_dim * sizeof(float));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* save data */
|
||||
for(i = 0; i < _OUTPUT_NUM; i++)
|
||||
{
|
||||
status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i],
|
||||
f32_out_buffer[i], out_elements[i] );
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
}
|
||||
|
||||
final:
|
||||
for (i = 0; i < _INPUT_NUM; i++)
|
||||
{
|
||||
if (f32_in_buffer[i])
|
||||
{
|
||||
free(f32_in_buffer[i]);
|
||||
f32_in_buffer[i] = NULL;
|
||||
}
|
||||
if (in_attr[i])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
|
||||
}
|
||||
}
|
||||
for (i = 0; i < _OUTPUT_NUM; i++)
|
||||
{
|
||||
if (f32_out_buffer[i])
|
||||
{
|
||||
free(f32_out_buffer[i]);
|
||||
f32_out_buffer[i] = NULL;
|
||||
}
|
||||
if (out_attr[i])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
|
||||
}
|
||||
}
|
||||
|
||||
return status;
|
||||
} /* _compute() */
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs
|
||||
/* Add extra params */
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
|
||||
kernel->info.function = _compute;
|
||||
kernel->info.parameters = _signal_frame_kernel_param_def;
|
||||
kernel->info.numParams = _cnt_of_array( _signal_frame_kernel_param_def );
|
||||
status = VSI_SUCCESS;
|
||||
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_SIGNAL_FRAME_PARAM_NUM] = {NULL};
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
int32_t frame_length = vsi_nn_kernel_param_get_int32( params, "frame_length" );
|
||||
int32_t frame_step = vsi_nn_kernel_param_get_int32( params, "frame_step" );
|
||||
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
|
||||
int32_t pad_end = vsi_nn_kernel_param_get_int32( params, "pad_end" );
|
||||
float pad_val = vsi_nn_kernel_param_get_float32( params, "pad_val" );
|
||||
|
||||
status = _query_kernel( kernel, inputs, outputs );
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( node_params, _SIGNAL_FRAME_PARAM_NUM,
|
||||
inputs, input_num, outputs, output_num );
|
||||
/* Pass parameters to node. */
|
||||
node_params[FRAME_LENGHT] = vsi_nn_kernel_scalar_create( graph, I32, &frame_length );
|
||||
node_params[FRAME_STEP] = vsi_nn_kernel_scalar_create( graph, I32, &frame_step );
|
||||
node_params[AXIS] = vsi_nn_kernel_scalar_create( graph, I32, &axis );
|
||||
node_params[PAD_END] = vsi_nn_kernel_scalar_create( graph, I32, &pad_end );
|
||||
node_params[PAD_VAL] = vsi_nn_kernel_scalar_create( graph, F32, &pad_val );
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _SIGNAL_FRAME_PARAM_NUM );
|
||||
vsi_nn_kernel_scalar_release( &node_params[FRAME_LENGHT] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[FRAME_STEP] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[AXIS] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[PAD_END] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[PAD_VAL] );
|
||||
}
|
||||
}
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_CPU( signal_frame, _setup )
@@ -0,0 +1,389 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "libnnext/vx_lib_nnext.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
/*
|
||||
* Define kernel meta.
|
||||
*/
|
||||
#define _INPUT_NUM (2)
|
||||
#define _OUTPUT_NUM (1)
|
||||
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.spatial_transformer")
|
||||
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _spatial_transformer_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
};
|
||||
#define _SPATIAL_TRANSFORMER_PARAM_NUM _cnt_of_array( _spatial_transformer_kernel_param_def )
|
||||
#define HAS_THETA_1_1 (3)
|
||||
#define HAS_THETA_1_2 (4)
|
||||
#define HAS_THETA_1_3 (5)
|
||||
#define HAS_THETA_2_1 (6)
|
||||
#define HAS_THETA_2_2 (7)
|
||||
#define HAS_THETA_2_3 (8)
|
||||
#define THETA_1_1 (9)
|
||||
#define THETA_1_2 (10)
|
||||
#define THETA_1_3 (11)
|
||||
#define THETA_2_1 (12)
|
||||
#define THETA_2_2 (13)
|
||||
#define THETA_2_3 (14)
|
||||
#define ALIGN_CORNERS (15)
|
||||
|
||||
static void _transform_affine(int32_t dst_x, int32_t dst_y, const float m[], float *src_x, float *src_y)
|
||||
{
|
||||
*src_x = dst_x * m[0] + dst_y * m[2] + m[4];
|
||||
*src_y = dst_x * m[1] + dst_y * m[3] + m[5];
|
||||
}
|
||||
|
||||
static float _read_pixel(float *base, vsi_nn_kernel_tensor_attr_t *attr,
|
||||
float x, float y, int32_t z, int32_t b)
|
||||
{
|
||||
vsi_bool out_of_bounds = (x < 0 || y < 0 || x >= attr->shape->data[0] || y >= attr->shape->data[1]);
|
||||
int32_t bx, by;
|
||||
int32_t offset = (b * attr->shape->data[2] + z) * attr->shape->data[0] * attr->shape->data[1];
|
||||
float pixel = 0;
|
||||
|
||||
if (out_of_bounds)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
// bounded x/y
|
||||
bx = (int32_t)x;
|
||||
by = (int32_t)y;
|
||||
|
||||
pixel = base[attr->shape->data[0] * by + bx + offset];
|
||||
|
||||
return pixel;
|
||||
}
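/* Out-of-bounds reads return 0, so the bilinear interpolation in _compute effectively
 * zero-pads the input image border. */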
|
||||
|
||||
/*
|
||||
* Kernel function
|
||||
*/
|
||||
DEF_KERNEL_EXECUTOR(_compute)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
|
||||
float *f32_in_buffer[_INPUT_NUM] = {NULL};
|
||||
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
|
||||
size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
|
||||
size_t out_elements[_OUTPUT_NUM] = {0};
|
||||
size_t out_bytes[_OUTPUT_NUM] = {0};
|
||||
int32_t i = 0;
|
||||
int32_t b = 0;
|
||||
int32_t c = 0;
|
||||
int32_t j = 0;
|
||||
int32_t x = 0;
|
||||
int32_t y = 0;
|
||||
int32_t has_theta[6] = {0};
|
||||
int32_t batch = 1;
|
||||
int32_t depth = 1;
|
||||
int32_t height = 1;
|
||||
int32_t width = 1;
|
||||
int32_t input_height = 1;
|
||||
int32_t input_width = 1;
|
||||
int32_t rank = 0;
|
||||
int32_t index = 0;
|
||||
int32_t align_corners = 0;
|
||||
float theta[6] = {0};
|
||||
|
||||
/* prepare data */
|
||||
for (i = 0; i < _INPUT_NUM; i ++)
|
||||
{
|
||||
input[i] = (vsi_nn_kernel_tensor_t)param[i];
|
||||
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
|
||||
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
|
||||
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input buffer fail.", final );
|
||||
}
|
||||
for (i = 0; i < _OUTPUT_NUM; i ++)
|
||||
{
|
||||
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
|
||||
out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
|
||||
vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] );
|
||||
out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
|
||||
out_bytes[i] = out_elements[i] * sizeof(float);
|
||||
f32_out_buffer[i] = (float *)malloc( out_bytes[i] );
|
||||
CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
|
||||
memset( f32_out_buffer[i], 0, out_bytes[i] );
|
||||
}
|
||||
|
||||
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[HAS_THETA_1_1], &has_theta[0]);
|
||||
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[HAS_THETA_1_2], &has_theta[1]);
|
||||
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[HAS_THETA_1_3], &has_theta[2]);
|
||||
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[HAS_THETA_2_1], &has_theta[3]);
|
||||
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[HAS_THETA_2_2], &has_theta[4]);
|
||||
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[HAS_THETA_2_3], &has_theta[5]);
|
||||
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_1_1], &theta[0]);
|
||||
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_1_2], &theta[1]);
|
||||
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_1_3], &theta[2]);
|
||||
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_2_1], &theta[3]);
|
||||
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_2_2], &theta[4]);
|
||||
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_2_3], &theta[5]);
|
||||
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[ALIGN_CORNERS], &align_corners);
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
|
||||
rank = (int32_t)out_attr[0]->shape->size;
|
||||
width = out_attr[0]->shape->data[0];
|
||||
height = out_attr[0]->shape->data[1];
|
||||
depth = rank > 2 ? out_attr[0]->shape->data[2] : 1;
|
||||
batch = rank > 3 ? out_attr[0]->shape->data[3] : 1;
|
||||
|
||||
input_width = in_attr[0]->shape->data[0];
|
||||
input_height = in_attr[0]->shape->data[1];
|
||||
|
||||
for (b = 0; b < batch; b++)
|
||||
{
|
||||
float _w = (float)input_width;
|
||||
float _h = (float)input_height;
|
||||
float w = (float)width;
|
||||
float h = (float)height;
|
||||
float matrix_m[6] = {0};
|
||||
j = 0;
|
||||
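/* Theta values not fixed as scalar params (has_theta[i] == 0) are read, in order,
 * from the second input tensor for the current batch. */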
for (i = 0; i < 6; i++)
|
||||
{
|
||||
if (has_theta[i] == 0)
|
||||
{
|
||||
theta[i] = f32_in_buffer[1][b * in_attr[1]->shape->data[0] + j];
|
||||
j ++;
|
||||
}
|
||||
}
|
||||
|
||||
if (align_corners && w > 1)
|
||||
{
|
||||
w = w - 1;
|
||||
}
|
||||
|
||||
if (align_corners && h > 1)
|
||||
{
|
||||
h = h - 1;
|
||||
}
|
||||
|
||||
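/* Map the 2x3 affine parameters from normalized coordinates to the pixel-space
 * matrix consumed by _transform_affine: the scale/shear terms are adjusted by the
 * input/output size ratio, and matrix_m[4]/matrix_m[5] appear to carry the
 * translation term. */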
matrix_m[0] = theta[4] * _w / w;
|
||||
matrix_m[2] = theta[3] * _w / h;
|
||||
matrix_m[4] = (theta[5] - theta[4] - theta[3] + 1) * _w * 0.5f;
|
||||
matrix_m[1] = theta[1] * _h / w;
|
||||
matrix_m[3] = theta[0] * _h / h;
|
||||
matrix_m[5] = (theta[2] - theta[1] - theta[0] + 1) * _h * 0.5f;
|
||||
for (c = 0; c < depth; c++)
|
||||
{
|
||||
for (y = 0; y < height; y++)
|
||||
{
|
||||
for (x = 0; x < width; x++)
|
||||
{
|
||||
float xf = 0;
|
||||
float yf = 0;
|
||||
float tl = 0, tr = 0, bl = 0, br = 0;
|
||||
float ar = 0, ab = 0, al = 0, at = 0;
|
||||
|
||||
_transform_affine(x, y, matrix_m, &xf, &yf);
|
||||
|
||||
xf = xf < 0 ? xf - 1 : xf;
|
||||
yf = yf < 0 ? yf - 1 : yf;
|
||||
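/* Bilinear interpolation: the fractional parts of the sampling point give the
 * right/bottom weights (ar, ab), their complements the left/top weights, and the
 * four neighbouring pixels fetched by _read_pixel are blended with them below. */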
ar = xf - floorf(xf);
|
||||
ab = yf - floorf(yf);
|
||||
al = 1.0f - ar;
|
||||
at = 1.0f - ab;
|
||||
|
||||
tl = _read_pixel(f32_in_buffer[0], in_attr[0], floorf(xf), floorf(yf), c, b);
|
||||
tr = _read_pixel(f32_in_buffer[0], in_attr[0], floorf(xf) + 1, floorf(yf), c, b);
|
||||
bl = _read_pixel(f32_in_buffer[0], in_attr[0], floorf(xf), floorf(yf) + 1, c, b);
|
||||
br = _read_pixel(f32_in_buffer[0], in_attr[0], floorf(xf) + 1, floorf(yf) + 1, c, b);
|
||||
|
||||
f32_out_buffer[0][index ++] = tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* save data */
|
||||
for(i = 0; i < _OUTPUT_NUM; i++)
|
||||
{
|
||||
status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i],
|
||||
f32_out_buffer[i], out_elements[i] );
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
}
|
||||
final:
|
||||
for (i = 0; i < _INPUT_NUM; i++)
|
||||
{
|
||||
if (f32_in_buffer[i])
|
||||
{
|
||||
free(f32_in_buffer[i]);
|
||||
f32_in_buffer[i] = NULL;
|
||||
}
|
||||
if (in_attr[i])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
|
||||
}
|
||||
}
|
||||
for(i = 0; i < _OUTPUT_NUM; i++)
|
||||
{
|
||||
if (f32_out_buffer[i])
|
||||
{
|
||||
free(f32_out_buffer[i]);
|
||||
f32_out_buffer[i] = NULL;
|
||||
}
|
||||
if (out_attr[i])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
|
||||
}
|
||||
}
|
||||
|
||||
return status;
|
||||
} /* _compute() */
|
||||
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs
|
||||
/* Add extra params */
|
||||
)
|
||||
{
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
|
||||
kernel->info.function = _compute;
|
||||
kernel->info.parameters = _spatial_transformer_kernel_param_def;
|
||||
kernel->info.numParams = _cnt_of_array( _spatial_transformer_kernel_param_def );
|
||||
|
||||
return VSI_SUCCESS;
|
||||
} /* _query_kernel() */
|
||||
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_SPATIAL_TRANSFORMER_PARAM_NUM];
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
int32_t has_theta_1_1 = vsi_nn_kernel_param_get_int32( params, "has_theta_1_1" );
|
||||
int32_t has_theta_1_2 = vsi_nn_kernel_param_get_int32( params, "has_theta_1_2" );
|
||||
int32_t has_theta_1_3 = vsi_nn_kernel_param_get_int32( params, "has_theta_1_3" );
|
||||
int32_t has_theta_2_1 = vsi_nn_kernel_param_get_int32( params, "has_theta_2_1" );
|
||||
int32_t has_theta_2_2 = vsi_nn_kernel_param_get_int32( params, "has_theta_2_2" );
|
||||
int32_t has_theta_2_3 = vsi_nn_kernel_param_get_int32( params, "has_theta_2_3" );
|
||||
float theta_1_1 = vsi_nn_kernel_param_get_float32( params, "theta_1_1" );
|
||||
float theta_1_2 = vsi_nn_kernel_param_get_float32( params, "theta_1_2" );
|
||||
float theta_1_3 = vsi_nn_kernel_param_get_float32( params, "theta_1_3" );
|
||||
float theta_2_1 = vsi_nn_kernel_param_get_float32( params, "theta_2_1" );
|
||||
float theta_2_2 = vsi_nn_kernel_param_get_float32( params, "theta_2_2" );
|
||||
float theta_2_3 = vsi_nn_kernel_param_get_float32( params, "theta_2_3" );
|
||||
int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" );
|
||||
|
||||
status = _query_kernel( kernel, inputs, outputs );
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( node_params, _SPATIAL_TRANSFORMER_PARAM_NUM,
|
||||
inputs, input_num, outputs, output_num );
|
||||
node_params[HAS_THETA_1_1] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_1 );
|
||||
node_params[HAS_THETA_1_2] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_2 );
|
||||
node_params[HAS_THETA_1_3] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_3 );
|
||||
node_params[HAS_THETA_2_1] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_1 );
|
||||
node_params[HAS_THETA_2_2] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_2 );
|
||||
node_params[HAS_THETA_2_3] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_3 );
|
||||
node_params[THETA_1_1] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_1 );
|
||||
node_params[THETA_1_2] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_2 );
|
||||
node_params[THETA_1_3] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_3 );
|
||||
node_params[THETA_2_1] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_1 );
|
||||
node_params[THETA_2_2] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_2 );
|
||||
node_params[THETA_2_3] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_3 );
|
||||
node_params[ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners );
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _SPATIAL_TRANSFORMER_PARAM_NUM );
|
||||
vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_1] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_2] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_3] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_1] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_2] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_3] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[THETA_1_1] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[THETA_1_2] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[THETA_1_3] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[THETA_2_1] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[THETA_2_2] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[THETA_2_3] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[ALIGN_CORNERS] );
|
||||
}
|
||||
}
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_CPU( spatial_transformer, _setup )
|
||||
|
|
@ -0,0 +1,185 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "libnnext/vx_lib_nnext.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
/*
|
||||
* Define kernel meta.
|
||||
*/
|
||||
#define _INPUT_NUM (1)
|
||||
#define _OUTPUT_NUM (1)
|
||||
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.sync_host")
|
||||
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _sync_host_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
};
|
||||
#define _SYNC_HOST_PARAM_NUM _cnt_of_array( _sync_host_kernel_param_def )
|
||||
|
||||
|
||||
/*
|
||||
* Kernel function
|
||||
*/
|
||||
DEF_KERNEL_EXECUTOR(_compute)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
|
||||
void *in_buffer[_INPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
|
||||
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
|
||||
size_t out_bytes[_OUTPUT_NUM] = {0};
|
||||
uint32_t i = 0;
|
||||
|
||||
/* prepare data */
|
||||
for (i = 0; i < _INPUT_NUM; i ++)
|
||||
{
|
||||
input[i] = (vsi_nn_kernel_tensor_t)param[i];
|
||||
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
|
||||
in_buffer[i] = vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], FALSE );
|
||||
CHECK_PTR_FAIL_GOTO( in_buffer[i], "Create input buffer fail.", final );
|
||||
}
|
||||
|
||||
for(i = 0; i < _OUTPUT_NUM; i ++)
|
||||
{
|
||||
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
|
||||
out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
|
||||
out_bytes[i] = vsi_nn_kernel_tensor_attr_get_bytes( out_attr[i] );
|
||||
}
|
||||
|
||||
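/* sync_host is a pass-through: the input bytes read above (as raw data, without
 * float conversion) are written directly to the output tensor. */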
/* save data */
|
||||
for(i = 0; i < _OUTPUT_NUM; i++)
|
||||
{
|
||||
status = vsi_nn_kernel_tensor_write( output[i], out_attr[i],
|
||||
in_buffer[i], out_bytes[i] );
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
}
|
||||
|
||||
final:
|
||||
for (i = 0; i < _INPUT_NUM; i++)
|
||||
{
|
||||
if (in_buffer[i])
|
||||
{
|
||||
free(in_buffer[i]);
|
||||
in_buffer[i] = NULL;
|
||||
}
|
||||
if (in_attr[i])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
|
||||
}
|
||||
}
|
||||
for(i = 0; i < _OUTPUT_NUM; i++)
|
||||
{
|
||||
if (out_attr[i])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
|
||||
}
|
||||
}
|
||||
|
||||
return status;
|
||||
} /* _compute() */
|
||||
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs
|
||||
/* Add extra params */
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
|
||||
kernel->info.function = _compute;
|
||||
kernel->info.parameters = _sync_host_kernel_param_def;
|
||||
kernel->info.numParams = _cnt_of_array( _sync_host_kernel_param_def );
|
||||
status = VSI_SUCCESS;
|
||||
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_SYNC_HOST_PARAM_NUM];
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
|
||||
status = _query_kernel( kernel, inputs, outputs /* Add extra params */ );
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( node_params, _SYNC_HOST_PARAM_NUM,
|
||||
inputs, input_num, outputs, output_num );
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _SYNC_HOST_PARAM_NUM );
|
||||
}
|
||||
}
|
||||
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_CPU( sync_host, _setup )
|
||||
|
|
@ -0,0 +1,219 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "libnnext/vx_lib_nnext.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
/*
|
||||
* Define kernel meta.
|
||||
*/
|
||||
#define _INPUT_NUM (2)
|
||||
#define _OUTPUT_NUM (1)
|
||||
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.tensorstackconcat")
|
||||
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _tensorstackconcat_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
};
|
||||
#define _TENSORSTACKCONCAT_PARAM_NUM _cnt_of_array( _tensorstackconcat_kernel_param_def )
|
||||
|
||||
|
||||
/*
|
||||
* Kernel function
|
||||
*/
|
||||
DEF_KERNEL_EXECUTOR(_compute)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
|
||||
float *f32_in_buffer[_INPUT_NUM] = {NULL};
|
||||
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
|
||||
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
|
||||
size_t out_elements[_OUTPUT_NUM] = {0};
|
||||
uint32_t i = 0;
|
||||
uint32_t depth = 0;
|
||||
uint32_t height = 1;
|
||||
uint32_t width = 0;
|
||||
uint32_t index = 0;
|
||||
uint32_t c = 0, y = 0, x = 0;
|
||||
|
||||
/* prepare data */
|
||||
for (i = 0; i < _INPUT_NUM; i ++)
|
||||
{
|
||||
input[i] = (vsi_nn_kernel_tensor_t)param[i];
|
||||
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
|
||||
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
|
||||
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input buffer fail.", final );
|
||||
}
|
||||
|
||||
for (i = 0; i < _OUTPUT_NUM; i ++)
|
||||
{
|
||||
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
|
||||
out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
|
||||
out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
|
||||
f32_out_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( output[i], out_attr[i], TRUE );
|
||||
CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
|
||||
}
|
||||
|
||||
depth = in_attr[0]->shape->data[2];
|
||||
height = in_attr[0]->shape->data[1];
|
||||
width = in_attr[0]->shape->data[0];
|
||||
index = (int32_t)f32_in_buffer[1][0];
|
||||
|
||||
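/* The scalar held in the second input selects the destination row: every element
 * of the first input is copied into row `index` of the matching channel of the
 * output. */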
for (c = 0; c < depth; c++)
|
||||
{
|
||||
for (y = 0; y < height; y++)
|
||||
{
|
||||
for (x = 0; x < width; x++)
|
||||
{
|
||||
int32_t i_idx = c * width * height + y * width + x;
|
||||
int32_t o_idx = (c * out_attr[0]->shape->data[1] + index ) * out_attr[0]->shape->data[0] + x;
|
||||
float value = f32_in_buffer[0][i_idx];
|
||||
|
||||
f32_out_buffer[0][o_idx] = value;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* save data */
|
||||
for(i = 0; i < _OUTPUT_NUM; i++)
|
||||
{
|
||||
status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i],
|
||||
f32_out_buffer[i], out_elements[i] );
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
}
|
||||
|
||||
final:
|
||||
for (i = 0; i < _INPUT_NUM; i++)
|
||||
{
|
||||
if (f32_in_buffer[i])
|
||||
{
|
||||
free(f32_in_buffer[i]);
|
||||
f32_in_buffer[i] = NULL;
|
||||
}
|
||||
if (in_attr[i])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
|
||||
}
|
||||
}
|
||||
for(i = 0; i < _OUTPUT_NUM; i++)
|
||||
{
|
||||
if (f32_out_buffer[i])
|
||||
{
|
||||
free(f32_out_buffer[i]);
|
||||
f32_out_buffer[i] = NULL;
|
||||
}
|
||||
if (out_attr[i])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
|
||||
}
|
||||
}
|
||||
|
||||
return status;
|
||||
} /* _compute() */
|
||||
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs
|
||||
/* Add extra params */
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
|
||||
kernel->info.function = _compute;
|
||||
kernel->info.parameters = _tensorstackconcat_kernel_param_def;
|
||||
kernel->info.numParams = _cnt_of_array( _tensorstackconcat_kernel_param_def );
|
||||
|
||||
status = VSI_SUCCESS;
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_TENSORSTACKCONCAT_PARAM_NUM];
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
|
||||
status = _query_kernel( kernel, inputs, outputs );
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( node_params, _TENSORSTACKCONCAT_PARAM_NUM,
|
||||
inputs, input_num, outputs, output_num );
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _TENSORSTACKCONCAT_PARAM_NUM );
|
||||
}
|
||||
}
|
||||
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_CPU( tensorstackconcat, _setup )
|
||||
|
|
@ -79,8 +79,8 @@ DEF_KERNEL_EXECUTOR(_compute)
|
|||
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
|
||||
float *f32_in_buffer[_INPUT_NUM] = {NULL};
|
||||
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
|
||||
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
|
||||
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
|
||||
size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
|
||||
size_t out_elements[_OUTPUT_NUM] = {0};
|
||||
size_t out_bytes[_OUTPUT_NUM] = {0};
|
||||
|
|
|
|||
|
|
@ -114,7 +114,7 @@ DEF_KERNEL_INITIALIZER(_detect_post_box_initializer)
|
|||
{0, 0, 0}
|
||||
};
|
||||
vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
|
||||
vsi_nn_kernel_tensor_attr_t * input1_attr = NULL;
|
||||
vsi_int_array_t * in_shape = NULL;
|
||||
float logE = (float)(log10(exp(1.0f)) / log10(2.0f));
|
||||
float scaleIn0 = 1.0f;
|
||||
|
|
@ -224,6 +224,7 @@ DEF_KERNEL_INITIALIZER(_detect_post_box_initializer)
|
|||
final:
|
||||
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
|
||||
SAFE_FREE_TENSOR_ATTR(input_attr);
|
||||
SAFE_FREE_TENSOR_ATTR(input1_attr);
|
||||
|
||||
return status;
|
||||
} /* _detect_post_box_initializer() */
|
||||
|
|
|
|||
|
|
@ -49,6 +49,8 @@ typedef enum
|
|||
UNARY_HSIGMOID,
|
||||
UNARY_MISH,
|
||||
UNARY_ROUND,
|
||||
UNARY_GELU,
|
||||
UNARY_HGELU,
|
||||
} unary_type_e;
|
||||
|
||||
/*
|
||||
|
|
@ -84,6 +86,8 @@ typedef enum
|
|||
#define HSIGMOID_OPERATION hard_sigmoid
|
||||
#define MISH_OPERATION mish
|
||||
#define ROUND_OPERATION round
|
||||
#define GELU_OPERATION gelu
|
||||
#define HGELU_OPERATION hard_gelu
|
||||
|
||||
static const struct {
|
||||
uint32_t key;
|
||||
|
|
@ -274,6 +278,42 @@ static const struct {
|
|||
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I8, I8 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I8, F16 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, BF16, BF16 , KERNEL_SOURCE_2D)
|
||||
|
||||
TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, F16, F16 , KERNEL_SOURCE_3D)
|
||||
TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, F16, I16 , KERNEL_SOURCE_3D)
|
||||
TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, F16, U8 , KERNEL_SOURCE_3D)
|
||||
TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, F16, I8 , KERNEL_SOURCE_3D)
|
||||
TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, I16, I16 , KERNEL_SOURCE_3D)
|
||||
TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, I16, F16 , KERNEL_SOURCE_3D)
|
||||
TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, U8, U8 , KERNEL_SOURCE_3D)
|
||||
TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, U8, F16 , KERNEL_SOURCE_3D)
|
||||
TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, I8, I8 , KERNEL_SOURCE_3D)
|
||||
TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, I8, F16 , KERNEL_SOURCE_3D)
|
||||
TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, BF16, BF16 , KERNEL_SOURCE_3D)
|
||||
|
||||
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, F16 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, I16 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, U8 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, I8 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I16, I16 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I16, F16 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, U8, U8 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, U8, F16 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I8, I8 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I8, F16 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, BF16, BF16 , KERNEL_SOURCE_2D)
|
||||
|
||||
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, F16 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, I16 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, U8 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, I8 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I16, I16 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I16, F16 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, U8, U8 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, U8, F16 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I8, I8 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I8, F16 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, BF16, BF16 , KERNEL_SOURCE_2D)
|
||||
};
|
||||
|
||||
#undef SIN_OPERATION
|
||||
|
|
@ -284,6 +324,8 @@ static const struct {
|
|||
#undef HSIGMOID_OPERATION
|
||||
#undef MISH_OPERATION
|
||||
#undef ROUND_OPERATION
|
||||
#undef GELU_OPERATION
|
||||
#undef HGELU_OPERATION
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
|
|
@ -403,6 +445,8 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer)
|
|||
case _PACK_SELECT_KEY( UNARY_HSIGMOID, BF16, BF16 ):
|
||||
case _PACK_SELECT_KEY( UNARY_MISH, BF16, BF16 ):
|
||||
case _PACK_SELECT_KEY( UNARY_ROUND, BF16, BF16 ):
|
||||
case _PACK_SELECT_KEY( UNARY_GELU, BF16, BF16 ):
|
||||
case _PACK_SELECT_KEY( UNARY_HGELU, BF16, BF16 ):
|
||||
{
|
||||
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
|
||||
0x11111111, // TCfg
|
||||
|
|
@ -682,6 +726,7 @@ REGISTER_ELTWISE_UNARY_BACKEND_EVIS( neg, UNARY_NEG )
|
|||
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( hard_sigmoid, UNARY_HSIGMOID )
|
||||
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( mish, UNARY_MISH )
|
||||
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( round, UNARY_ROUND )
|
||||
|
||||
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( gelu, UNARY_GELU )
|
||||
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( hard_gelu, UNARY_HGELU )
|
||||
|
||||
__END_DECLS
|
||||
|
|
|
|||
|
|
@ -0,0 +1,243 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
|
||||
// Add kernel hashtable here
|
||||
#define EXTRA_ENDING_HASH_KEY( OUT_DTYPE ) \
|
||||
( ( OUT_DTYPE ) )
|
||||
#define EXTRA_ENDING_KERNEL_MAP( OUT_DTYPE ) \
|
||||
{ EXTRA_ENDING_HASH_KEY( OUT_DTYPE ), \
|
||||
CVIVANTE_NAMESPACE("evis.extra_ending_"#OUT_DTYPE), \
|
||||
"extra_ending" }
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint32_t key;
|
||||
char * function_name;
|
||||
const char * source_name;
|
||||
} _kernel_map_type;
|
||||
|
||||
static const _kernel_map_type _extra_ending_kernel_map[] =
|
||||
{
|
||||
// Register kernel here
|
||||
EXTRA_ENDING_KERNEL_MAP( F16 ),
|
||||
EXTRA_ENDING_KERNEL_MAP( I16 ),
|
||||
EXTRA_ENDING_KERNEL_MAP( U8 ),
|
||||
EXTRA_ENDING_KERNEL_MAP( I8 ),
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _extra_ending_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
};
|
||||
#define _EXTRA_ENDING_PARAM_NUM _cnt_of_array( _extra_ending_kernel_param_def )
|
||||
|
||||
/*
|
||||
* Kernel initializer
|
||||
*/
|
||||
DEF_KERNEL_INITIALIZER(_extra_ending_initializer)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
gpu_param_t gpu_param = {
|
||||
3,
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0}
|
||||
};
|
||||
vsi_nn_kernel_tensor_attr_t * attr = NULL;
|
||||
vsi_int_array_t * out_shape = NULL;
|
||||
|
||||
attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
|
||||
out_shape = attr->shape;
|
||||
|
||||
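/* Work distribution: each work-item handles 8 elements along dimension 0; the
 * global size below is the shape taken from param[1], rounded up accordingly. */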
gpu_param.global_scale[0] = 8;
|
||||
gpu_param.global_scale[1] = 1;
|
||||
gpu_param.global_scale[2] = 1;
|
||||
gpu_param.global_size[0] = (out_shape->data[0] + gpu_param.global_scale[0] - 1)
|
||||
/ gpu_param.global_scale[0];
|
||||
gpu_param.global_size[1] = out_shape->data[1];
|
||||
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
|
||||
|
||||
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
final:
|
||||
if (attr)
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &attr );
|
||||
attr = NULL;
|
||||
}
|
||||
|
||||
return status;
|
||||
} /* _extra_ending_initializer() */
|
||||
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_dtype_e out_dtype;
|
||||
uint32_t key = 0;
|
||||
uint32_t i = 0;
|
||||
|
||||
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
|
||||
|
||||
key = EXTRA_ENDING_HASH_KEY( out_dtype );
|
||||
|
||||
for ( i = 0; i < _cnt_of_array(_extra_ending_kernel_map); i ++ )
|
||||
{
|
||||
if ( _extra_ending_kernel_map[i].key == key )
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
if ( i < _cnt_of_array(_extra_ending_kernel_map) )
|
||||
{
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _extra_ending_kernel_map[i].function_name );
|
||||
kernel->info.parameters = _extra_ending_kernel_param_def;
|
||||
kernel->info.numParams = _cnt_of_array( _extra_ending_kernel_param_def );
|
||||
kernel->info.initialize = _extra_ending_initializer;
|
||||
// Register code source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
|
||||
"vsi_nn_kernel_header",
|
||||
_extra_ending_kernel_map[i].source_name );
|
||||
// Register binary source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
|
||||
_extra_ending_kernel_map[i].source_name );
|
||||
status = VSI_SUCCESS;
|
||||
}
|
||||
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_EXTRA_ENDING_PARAM_NUM] = {NULL};
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
uint32_t rank[3] = {0};
|
||||
int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{ 1 }};
|
||||
vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
|
||||
int32_t i = 0;
|
||||
|
||||
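/* Collapse both inputs and the output to optimized 1-D shapes before building the
 * node; the reshaped tensor views are released in the final block. */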
vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num,
|
||||
shapes[0], &rank[0]);
|
||||
vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num,
|
||||
shapes[1], &rank[1]);
|
||||
vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num,
|
||||
shapes[2], &rank[2]);
|
||||
|
||||
for (i = 0; i < 2; i++)
|
||||
{
|
||||
reshape_tensors[i] = vsi_nn_reshape_tensor( graph,
|
||||
inputs[i], (uint32_t*)shapes[i], rank[i] );
|
||||
}
|
||||
reshape_tensors[2] = vsi_nn_reshape_tensor( graph,
|
||||
outputs[0], (uint32_t*)shapes[2], rank[2] );
|
||||
|
||||
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[0]->attr.size,
|
||||
inputs[0]->attr.dim_num ) )
|
||||
{
|
||||
goto final;
|
||||
}
|
||||
|
||||
status = _query_kernel( kernel, inputs, outputs );
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
vx_border_t border;
|
||||
|
||||
border.mode = VX_BORDER_CONSTANT;
|
||||
border.constant_value.U32 = 0;
|
||||
status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border));
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( node_params, _EXTRA_ENDING_PARAM_NUM,
|
||||
reshape_tensors, input_num, &reshape_tensors[2], output_num );
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _EXTRA_ENDING_PARAM_NUM );
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
}
|
||||
}
|
||||
|
||||
final:
|
||||
for (i = 0; i < 3; i++)
|
||||
{
|
||||
vsi_safe_release_tensor(reshape_tensors[i]);
|
||||
}
|
||||
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_EVIS( extra_ending, _setup )
|
||||
|
|
@ -991,8 +991,8 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer)
|
|||
int32_t _is_ln = 0;
|
||||
int32_t _is_cifg = 0;
|
||||
int32_t _is_hybrid = 0;
|
||||
vsi_nn_kernel_tensor_attr_t* input_attr[9];
|
||||
vsi_nn_kernel_tensor_attr_t* attr[2];
|
||||
vsi_nn_kernel_tensor_attr_t* input_attr[9] = {NULL};
|
||||
vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL};
|
||||
|
||||
status = vsi_nn_kernel_scalar_read_int32( (vsi_nn_kernel_scalar_t)param[param_size - 5], &_is_ln );
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
|
|
|||
|
|
@ -44,6 +44,8 @@ __BEGIN_DECLS
|
|||
#define KERNEL_SOURCE_3 "moments_axis2"
|
||||
#define KERNEL_SOURCE_4 "moments_axis01"
|
||||
#define KERNEL_SOURCE_5 "moments_axis012"
|
||||
#define KERNEL_SOURCE_6 "moments_u8"
|
||||
#define KERNEL_SOURCE_7 "moments_u8_axis012"
|
||||
|
||||
// Add kernel hashtable here
|
||||
#define HASH_MOMENTS_KEY(_input0_type, _output_type, _axis_num, _axis0, _axis1, _axis2, _image_2d) \
|
||||
|
|
@ -107,14 +109,19 @@ static const struct {
|
|||
TENSOR_MOMENTS_KERNELS(I8, F16, 2, KERNEL_SOURCE_3)
|
||||
TENSOR_MOMENTS_KERNELS(I16, F16, 2, KERNEL_SOURCE_3)
|
||||
TENSOR_MOMENTS_KERNELS(F16, F16, 2, KERNEL_SOURCE_3)
|
||||
TENSOR_MOMENTS_KERNELS(U8, U8, 0, KERNEL_SOURCE_6)
|
||||
TENSOR_MOMENTS_KERNELS(U8, U8, 1, KERNEL_SOURCE_6)
|
||||
TENSOR_MOMENTS_KERNELS(U8, U8, 2, KERNEL_SOURCE_6)
|
||||
TENSOR_MOMENTS_TWO_AXIS_KERNELS(U8, F16, 0, 1, KERNEL_SOURCE_4)
|
||||
TENSOR_MOMENTS_TWO_AXIS_KERNELS(I8, F16, 0, 1, KERNEL_SOURCE_4)
|
||||
TENSOR_MOMENTS_TWO_AXIS_KERNELS(I16, F16, 0, 1, KERNEL_SOURCE_4)
|
||||
TENSOR_MOMENTS_TWO_AXIS_KERNELS(F16, F16, 0, 1, KERNEL_SOURCE_4)
|
||||
TENSOR_MOMENTS_TWO_AXIS_KERNELS(U8, U8, 0, 1, KERNEL_SOURCE_6)
|
||||
TENSOR_MOMENTS_THREE_AXIS_KERNELS(U8, F16, 0, 1, 2, KERNEL_SOURCE_5)
|
||||
TENSOR_MOMENTS_THREE_AXIS_KERNELS(I8, F16, 0, 1, 2, KERNEL_SOURCE_5)
|
||||
TENSOR_MOMENTS_THREE_AXIS_KERNELS(I16, F16, 0, 1, 2, KERNEL_SOURCE_5)
|
||||
TENSOR_MOMENTS_THREE_AXIS_KERNELS(F16, F16, 0, 1, 2, KERNEL_SOURCE_5)
|
||||
TENSOR_MOMENTS_THREE_AXIS_KERNELS(U8, U8, 0, 1, 2, KERNEL_SOURCE_7)
|
||||
TENSOR_MOMENTS_KERNELS_2D(U8, F16, 0, KERNEL_SOURCE_1)
|
||||
TENSOR_MOMENTS_KERNELS_2D(I8, F16, 0, KERNEL_SOURCE_1)
|
||||
TENSOR_MOMENTS_KERNELS_2D(I16, F16, 0, KERNEL_SOURCE_1)
|
||||
|
|
@ -123,10 +130,13 @@ static const struct {
|
|||
TENSOR_MOMENTS_KERNELS_2D(I8, F16, 1, KERNEL_SOURCE_2)
|
||||
TENSOR_MOMENTS_KERNELS_2D(I16, F16, 1, KERNEL_SOURCE_2)
|
||||
TENSOR_MOMENTS_KERNELS_2D(F16, F16, 1, KERNEL_SOURCE_2)
|
||||
TENSOR_MOMENTS_KERNELS_2D(U8, U8, 0, KERNEL_SOURCE_6)
|
||||
TENSOR_MOMENTS_KERNELS_2D(U8, U8, 1, KERNEL_SOURCE_6)
|
||||
TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(U8, F16, 0, 1, KERNEL_SOURCE_4)
|
||||
TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(I8, F16, 0, 1, KERNEL_SOURCE_4)
|
||||
TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(I16, F16, 0, 1, KERNEL_SOURCE_4)
|
||||
TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(F16, F16, 0, 1, KERNEL_SOURCE_4)
|
||||
TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(U8, U8, 0, 1, KERNEL_SOURCE_6)
|
||||
};
|
||||
|
||||
/*
|
||||
|
|
@ -179,31 +189,41 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
|
|||
{0, 0, 0}, // localWorkSize: local group size in thread
|
||||
{0, 0, 0}}; // globalWorkSize: image size in thread
|
||||
|
||||
vsi_nn_kernel_tensor_attr_t* attr[1] = {NULL};
|
||||
vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL, NULL};
|
||||
vsi_int_array_t * input_shape = NULL;
|
||||
float scaleIn = 0;
|
||||
int32_t input_zp = 0;
|
||||
vx_uint32 iter = 0;
|
||||
int32_t sumInZp = 0;
|
||||
int32_t tmpZp1 = 0;
|
||||
float tmpZp2 = 0;
|
||||
float e2InScale = 0;
|
||||
float rowSumScale = 0;
|
||||
int32_t axis = 0;
|
||||
int32_t axis_num = 0;
|
||||
int32_t width = 0;
|
||||
int32_t height = 0;
|
||||
int32_t chn = 0;
|
||||
float dimRatio = 1.0;
|
||||
int32_t iterSize = 16;
|
||||
float zpScaleSqr_i16 = 0.0f;
|
||||
float zpScale2_i16 = 0.0f;
|
||||
float sumScale_i16 = 0.0f;
|
||||
float scaleIn = 0;
|
||||
int32_t input_zp = 0;
|
||||
vx_uint32 iter = 0;
|
||||
int32_t sumInZp = 0;
|
||||
int32_t tmpZp1 = 0;
|
||||
float tmpZp2 = 0;
|
||||
float e2InScale = 0;
|
||||
float rowSumScale = 0;
|
||||
int32_t axis = 0;
|
||||
int32_t axis_num = 0;
|
||||
int32_t width = 0;
|
||||
int32_t height = 0;
|
||||
int32_t chn = 0;
|
||||
float dimRatio = 1.0;
|
||||
int32_t iterSize = 16;
|
||||
float zpScaleSqr_i16 = 0.0f;
|
||||
float zpScale2_i16 = 0.0f;
|
||||
float sumScale_i16 = 0.0f;
|
||||
float output_ZP[4] = {0.0f, 0.0f, 0.0f, 0.0f};
|
||||
float outputScale[4] = {1.0f, 1.0f, 1.0f, 1.0f};
|
||||
float output_ZP0 = 0.0f;
|
||||
float outputScale0 = 1;
|
||||
float output_ZP1 = 0.0f;
|
||||
float outputScale1 = 1.0f;
|
||||
|
||||
uint32_t pack_key = 0;
|
||||
|
||||
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
|
||||
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
|
||||
attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError );
|
||||
|
||||
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &axis);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
|
|
@ -212,10 +232,13 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
|
|||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
|
||||
input_shape = attr[0]->shape;
|
||||
input_zp = attr[0]->asymm.zero_point;
|
||||
scaleIn = attr[0]->asymm.scale;
|
||||
|
||||
if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
|
||||
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
|
||||
{
|
||||
input_zp = attr[0]->asymm.zero_point;
|
||||
scaleIn = attr[0]->asymm.scale;
|
||||
}
|
||||
else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
|
||||
{
|
||||
if (attr[0]->dfp.fl > 0)
|
||||
{
|
||||
|
|
@ -234,6 +257,57 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
|
|||
scaleIn = 1;
|
||||
}
|
||||
|
||||
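/* Output scale/zero-point selection for the two outputs: asymmetric quantization
 * uses the zero point and 1/scale, DFP uses 2^fl as the scale (e.g. fl = 7 gives
 * 128, fl = -2 gives 0.25) with zero point 0, and non-quantized outputs keep the
 * identity values of 1 and 0. */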
if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
|
||||
{
|
||||
output_ZP0 = (float)attr[1]->asymm.zero_point;
|
||||
outputScale0 = 1.0f / attr[1]->asymm.scale;
|
||||
}
|
||||
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
|
||||
{
|
||||
if (attr[1]->dfp.fl > 0)
|
||||
{
|
||||
outputScale0 = (float)((int64_t)1 << attr[1]->dfp.fl);
|
||||
}
|
||||
else
|
||||
{
|
||||
outputScale0 = (1.0f / (float)((int64_t)1 << -attr[1]->dfp.fl));
|
||||
}
|
||||
output_ZP0 = 0.0f;
|
||||
}
|
||||
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE )
|
||||
{
|
||||
outputScale0 = 1.0f;
|
||||
output_ZP0 = 0.0f;
|
||||
}
|
||||
|
||||
if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
|
||||
{
|
||||
output_ZP1 = (float)attr[2]->asymm.zero_point;
|
||||
outputScale1 = 1.0f / attr[2]->asymm.scale;
|
||||
}
|
||||
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
|
||||
{
|
||||
if (attr[2]->dfp.fl > 0)
|
||||
{
|
||||
outputScale1 = (float)((int64_t)1 << attr[2]->dfp.fl);
|
||||
}
|
||||
else
|
||||
{
|
||||
outputScale1 = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl));
|
||||
}
|
||||
output_ZP1 = 0.0f;
|
||||
}
|
||||
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE )
|
||||
{
|
||||
outputScale1 = 1.0f;
|
||||
output_ZP1 = 0.0f;
|
||||
}
|
||||
|
||||
output_ZP[0] = output_ZP0;
|
||||
output_ZP[1] = output_ZP1;
|
||||
outputScale[0] = outputScale0;
|
||||
outputScale[1] = outputScale1;
|
||||
|
||||
if(attr[0]->dtype == I16)
|
||||
{
|
||||
iterSize = 8;
|
||||
|
|
@ -316,10 +390,10 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
|
|||
zpScale2_i16 = tmpZp1 * e2InScale;
|
||||
sumScale_i16 = sumInZp * scaleIn;
|
||||
|
||||
#define _PACK_SELECT_KEY( IN0_TYPE, AXIS_NUM, FIRST_AXIS ) \
|
||||
(IN0_TYPE | (AXIS_NUM << 8) | (FIRST_AXIS << 16))
|
||||
#define _PACK_SELECT_KEY( IN0_TYPE, OUT0_TYPE, AXIS_NUM, FIRST_AXIS ) \
|
||||
(IN0_TYPE | (OUT0_TYPE << 8) | (AXIS_NUM << 16) | (FIRST_AXIS << 24))
|
||||
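/* The pack key encodes input dtype, output dtype, axis count and first axis, so
 * that U8-output variants are dispatched separately from the F16-output ones in
 * the switch below. */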
|
||||
pack_key = _PACK_SELECT_KEY( attr[0]->dtype, axis_num, axis);
|
||||
pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype, axis_num, axis);
|
||||
|
||||
{
|
||||
gpu_dp_inst_t uniSumU8_16x1 = {{
|
||||
|
|
@ -377,11 +451,22 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
|
|||
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
|
||||
}, GPU_DP_TYPE_16 };
|
||||
|
||||
gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
|
||||
0x33333333, // TCfg
|
||||
0x11110000, // ASelt
|
||||
0x03020100, 0x03020100, // ABin
|
||||
0x00000000, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00002400, // AccumType, ConstantType, and PostShift
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
|
||||
}, GPU_DP_TYPE_16 };
|
||||
|
||||
switch( pack_key )
|
||||
{
|
||||
case _PACK_SELECT_KEY( U8, 1, 0):
|
||||
case _PACK_SELECT_KEY( I8, 1, 0):
|
||||
case _PACK_SELECT_KEY( I16, 1, 0):
|
||||
case _PACK_SELECT_KEY( U8, F16, 1, 0):
|
||||
case _PACK_SELECT_KEY( I8, F16, 1, 0):
|
||||
case _PACK_SELECT_KEY( I16, F16, 1, 0):
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1);
|
||||
|
|
@ -395,22 +480,28 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
|
|||
status |= vsi_nn_kernel_gpu_add_param(node, "zpScale2_i16", &zpScale2_i16);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "sumScale_i16", &sumScale_i16);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8",
|
||||
&uniConvertHalftoFp16_2x8);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
case _PACK_SELECT_KEY( F16, 1, 0):
|
||||
case _PACK_SELECT_KEY( F16, F16, 1, 0):
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8",
|
||||
&uniConvertHalftoFp16_2x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "width", &width);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
case _PACK_SELECT_KEY( U8, 1, 1):
|
||||
case _PACK_SELECT_KEY( I8, 1, 1):
|
||||
case _PACK_SELECT_KEY( I16, 1, 1):
|
||||
case _PACK_SELECT_KEY( U8, F16, 1, 1):
|
||||
case _PACK_SELECT_KEY( I8, F16, 1, 1):
|
||||
case _PACK_SELECT_KEY( I16, F16, 1, 1):
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4",
|
||||
&uniConvert1stUint8SubZpToFp32_4x4);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8",
|
||||
&uniConvertHalftoFp16_2x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale);
|
||||
|
|
@ -418,19 +509,23 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
|
|||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
case _PACK_SELECT_KEY( F16, 1, 1):
|
||||
case _PACK_SELECT_KEY( F16, F16, 1, 1):
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8",
|
||||
&uniConvertHalftoFp16_2x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "height", &height);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
case _PACK_SELECT_KEY( U8, 1, 2):
|
||||
case _PACK_SELECT_KEY( I8, 1, 2):
|
||||
case _PACK_SELECT_KEY( I16, 1, 2):
|
||||
case _PACK_SELECT_KEY( U8, F16, 1, 2):
|
||||
case _PACK_SELECT_KEY( I8, F16, 1, 2):
|
||||
case _PACK_SELECT_KEY( I16, F16, 1, 2):
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4",
|
||||
&uniConvert1stUint8SubZpToFp32_4x4);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8",
|
||||
&uniConvertHalftoFp16_2x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale);
|
||||
|
|
@ -438,16 +533,18 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
|
|||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
case _PACK_SELECT_KEY( F16, 1, 2):
|
||||
case _PACK_SELECT_KEY( F16, F16, 1, 2):
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8",
|
||||
&uniConvertHalftoFp16_2x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "channel", &chn);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
case _PACK_SELECT_KEY( U8, 2, 0):
|
||||
case _PACK_SELECT_KEY( I8, 2, 0):
|
||||
case _PACK_SELECT_KEY( I16, 2, 0):
|
||||
case _PACK_SELECT_KEY( U8, F16, 2, 0):
|
||||
case _PACK_SELECT_KEY( I8, F16, 2, 0):
|
||||
case _PACK_SELECT_KEY( I16, F16, 2, 0):
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1);
|
||||
|
|
@ -462,12 +559,14 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
|
|||
status |= vsi_nn_kernel_gpu_add_param(node, "zpScale2_i16", &zpScale2_i16);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "sumScale_i16", &sumScale_i16);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8",
|
||||
&uniConvertHalftoFp16_2x8);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
case _PACK_SELECT_KEY( U8, 3, 0):
|
||||
case _PACK_SELECT_KEY( I8, 3, 0):
|
||||
case _PACK_SELECT_KEY( I16, 3, 0):
|
||||
case _PACK_SELECT_KEY( U8, F16, 3, 0):
|
||||
case _PACK_SELECT_KEY( I8, F16, 3, 0):
|
||||
case _PACK_SELECT_KEY( I16, F16, 3, 0):
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1);
|
||||
|
|
@ -483,32 +582,85 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
|
|||
status |= vsi_nn_kernel_gpu_add_param(node, "zpScale2_i16", &zpScale2_i16);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "sumScale_i16", &sumScale_i16);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8",
|
||||
&uniConvertHalftoFp16_2x8);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
case _PACK_SELECT_KEY( F16, 2, 0):
|
||||
case _PACK_SELECT_KEY( F16, F16, 2, 0):
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8",
|
||||
&uniConvertHalftoFp16_2x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "width", &width);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "height", &height);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
case _PACK_SELECT_KEY( F16, 3, 0):
|
||||
case _PACK_SELECT_KEY( F16, F16, 3, 0):
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8",
|
||||
&uniConvertHalftoFp16_2x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "width", &width);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "height", &height);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "channel", &chn);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
case _PACK_SELECT_KEY( U8, U8, 1, 0):
|
||||
case _PACK_SELECT_KEY( U8, U8, 1, 1):
|
||||
case _PACK_SELECT_KEY( U8, U8, 1, 2):
|
||||
case _PACK_SELECT_KEY( U8, U8, 2, 0):
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8",
|
||||
&uniConvertInt32toUint8_2x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4",
|
||||
&uniConvert1stUint8SubZpToFp32_4x4);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "rowSumScale", &rowSumScale);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "width", &width);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "height", &height);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "channel", &chn);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &output_ZP);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &outputScale);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP0", &output_ZP0);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "outputScale0", &outputScale0);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP1", &output_ZP1);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "outputScale1", &outputScale1);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
case _PACK_SELECT_KEY( U8, U8, 3, 0):
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8",
|
||||
&uniConvertInt32toUint8_2x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "rowSumScale", &rowSumScale);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "width", &width);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "height", &height);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "channel", &chn);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &output_ZP);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &outputScale);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
default:
|
||||
VSI_ASSERT( FALSE );
|
||||
break;
|
||||
}
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "dimRatio", &dimRatio);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8", &uniConvertHalftoFp16_2x8);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
#undef _PACK_SELECT_KEY
|
||||
|
|
@ -519,6 +671,16 @@ OnError:
|
|||
vsi_nn_kernel_tensor_attr_release( &attr[0] );
|
||||
attr[0] = NULL;
|
||||
}
|
||||
if (attr[1])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &attr[1] );
|
||||
attr[1] = NULL;
|
||||
}
|
||||
if (attr[2])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &attr[2] );
|
||||
attr[2] = NULL;
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
|
|
|||
File diff suppressed because it is too large
|
|
@ -0,0 +1,292 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "utils/vsi_nn_dtype_util.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
#define SIGNAL_FRAME_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
|
||||
( ( IN_DTYPE << 8 ) | ( OUT_DTYPE ) )
|
||||
#define SIGNAL_FRAME_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
|
||||
{ SIGNAL_FRAME_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \
|
||||
CVIVANTE_NAMESPACE("evis.signal_frame_"#IN_DTYPE"to"#OUT_DTYPE), \
|
||||
"signal_frame" }
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint32_t key;
|
||||
char * function_name;
|
||||
const char * source_name;
|
||||
} _kernel_map_type;
|
||||
|
||||
static const _kernel_map_type _signal_frame_kernel_map[] =
|
||||
{
|
||||
// Register kernel here
|
||||
SIGNAL_FRAME_KERNEL_MAP( I16, I16 ),
|
||||
SIGNAL_FRAME_KERNEL_MAP( F16, F16 ),
|
||||
SIGNAL_FRAME_KERNEL_MAP( BF16, BF16 ),
|
||||
SIGNAL_FRAME_KERNEL_MAP( U8, U8 ),
|
||||
SIGNAL_FRAME_KERNEL_MAP( I8, I8 ),
|
||||
};
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _signal_frame_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
// Add kernel parameters here
|
||||
};
|
||||
#define _SIGNAL_FRAME_PARAM_NUM _cnt_of_array( _signal_frame_kernel_param_def )
|
||||
#define FRAME_STEP (2)
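/* FRAME_STEP is the index of the frame_step scalar within _signal_frame_kernel_param_def. */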
|
||||
/*
|
||||
* Kernel initializer
|
||||
*/
|
||||
DEF_KERNEL_INITIALIZER(_signal_frame_initializer)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
gpu_param_t gpu_param = {
|
||||
3,
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0}
|
||||
};
|
||||
vsi_nn_kernel_tensor_attr_t * attr = NULL;
|
||||
vsi_int_array_t * out_shape = NULL;
|
||||
|
||||
attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
|
||||
out_shape = attr->shape;
|
||||
|
||||
gpu_param.global_scale[0] = 16;
|
||||
gpu_param.global_scale[1] = 1;
|
||||
gpu_param.global_scale[2] = 1;
|
||||
if ( attr->dtype == F16 || attr->dtype == I16 || attr->dtype == U16 || attr->dtype == BF16)
|
||||
{
|
||||
gpu_param.global_scale[0] = 8;
|
||||
}
|
||||
gpu_param.global_size[0] = (out_shape->data[0] + gpu_param.global_scale[0] - 1)
|
||||
/ gpu_param.global_scale[0];
|
||||
gpu_param.global_size[1] = out_shape->data[1];
|
||||
gpu_param.global_size[2] = out_shape->data[2];
|
||||
|
||||
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
final:
|
||||
if (attr)
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &attr );
|
||||
attr = NULL;
|
||||
}
|
||||
|
||||
return status;
|
||||
} /* _signal_frame_initializer() */
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_dtype_e in_dtype;
|
||||
vsi_nn_kernel_dtype_e out_dtype;
|
||||
uint32_t key = 0;
|
||||
uint32_t i = 0;
|
||||
|
||||
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
|
||||
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
|
||||
|
||||
key = SIGNAL_FRAME_HASH_KEY( in_dtype, out_dtype );
|
||||
|
||||
for ( i = 0; i < _cnt_of_array(_signal_frame_kernel_map); i ++ )
|
||||
{
|
||||
if ( _signal_frame_kernel_map[i].key == key )
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
if ( i < _cnt_of_array(_signal_frame_kernel_map) )
|
||||
{
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _signal_frame_kernel_map[i].function_name );
|
||||
kernel->info.parameters = _signal_frame_kernel_param_def;
|
||||
kernel->info.numParams = _cnt_of_array( _signal_frame_kernel_param_def );
|
||||
kernel->info.initialize = _signal_frame_initializer;
|
||||
// Register code source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
|
||||
"vsi_nn_kernel_header",
|
||||
_signal_frame_kernel_map[i].source_name );
|
||||
// Register binary source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
|
||||
_signal_frame_kernel_map[i].source_name );
|
||||
status = VSI_SUCCESS;
|
||||
}
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_SIGNAL_FRAME_PARAM_NUM] = {NULL};
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
int32_t frame_length = vsi_nn_kernel_param_get_int32( params, "frame_length" );
|
||||
int32_t frame_step = vsi_nn_kernel_param_get_int32( params, "frame_step" );
|
||||
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
|
||||
int32_t pad_end = vsi_nn_kernel_param_get_int32( params, "pad_end" );
|
||||
float pad_value = vsi_nn_kernel_param_get_float32( params, "pad_val" );
|
||||
int32_t num_frames = outputs[0]->attr.size[axis + 1];
|
||||
int32_t rank = inputs[0]->attr.dim_num;
|
||||
int32_t inner = 1;
|
||||
int32_t outer = 1;
|
||||
int32_t length_samples = inputs[0]->attr.size[axis];
|
||||
int32_t i = 0;
|
||||
vsi_nn_tensor_t* rs_tensors[2] = { NULL };
|
||||
int32_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }};
|
||||
|
||||
for (i = 0; i < axis; i++)
|
||||
{
|
||||
inner *= inputs[0]->attr.size[i];
|
||||
}
|
||||
|
||||
for (i = axis + 1; i < rank; i++)
|
||||
{
|
||||
outer *= inputs[0]->attr.size[i];
|
||||
}
|
||||
|
||||
shape[0][0] = inner;
|
||||
shape[0][1] = length_samples;
|
||||
shape[0][2] = 1;
|
||||
shape[0][3] = outer;
|
||||
|
||||
shape[1][0] = inner;
|
||||
shape[1][1] = frame_length;
|
||||
shape[1][2] = num_frames;
|
||||
shape[1][3] = outer;
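/* The input is flattened to [inner, length_samples, 1, outer] and the output to
 * [inner, frame_length, num_frames, outer], where inner/outer are the products of
 * the dimensions below/above the frame axis. */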
|
||||
|
||||
rs_tensors[0] = vsi_nn_reshape_tensor( graph,
|
||||
inputs[0], (uint32_t*)shape[0], 4 );
|
||||
rs_tensors[1] = vsi_nn_reshape_tensor( graph,
|
||||
outputs[0], (uint32_t*)shape[1], 4 );
|
||||
|
||||
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)rs_tensors[1]->attr.size,
|
||||
rs_tensors[1]->attr.dim_num ) )
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
status = _query_kernel( kernel, inputs, outputs );
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
if ( pad_end )
|
||||
{
|
||||
// Set default border mode.
|
||||
vx_border_t border;
|
||||
uint32_t data = 0;
|
||||
uint32_t dsize = 1;
|
||||
|
||||
vsi_nn_Float32ToDtype(pad_value, (uint8_t*)&data, &outputs[0]->attr.dtype);
|
||||
border.mode = VX_BORDER_CONSTANT;
|
||||
dsize = vsi_nn_GetTypeBytes( inputs[0]->attr.dtype.vx_type );
|
||||
if ( dsize == 1 )
|
||||
{
|
||||
border.constant_value.U8 = (uint8_t)data;
|
||||
}
|
||||
else if ( dsize == 4 )
|
||||
{
|
||||
border.constant_value.U32 = data;
|
||||
}
|
||||
else
|
||||
{
|
||||
border.constant_value.U16 = (uint16_t)data;
|
||||
}
|
||||
|
||||
status |= vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) );
|
||||
}
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( node_params, _SIGNAL_FRAME_PARAM_NUM,
|
||||
&rs_tensors[0], input_num, &rs_tensors[1], output_num );
|
||||
node_params[FRAME_STEP] = vsi_nn_kernel_scalar_create(
|
||||
graph, I32, &frame_step );
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _SIGNAL_FRAME_PARAM_NUM );
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
}
|
||||
}
|
||||
final:
|
||||
if (rs_tensors[0])
|
||||
{
|
||||
vsi_nn_ReleaseTensor( &rs_tensors[0] );
|
||||
}
|
||||
|
||||
if (rs_tensors[1])
|
||||
{
|
||||
vsi_nn_ReleaseTensor( &rs_tensors[1] );
|
||||
}
|
||||
|
||||
if (node_params[FRAME_STEP])
|
||||
{
|
||||
vsi_nn_kernel_scalar_release( &node_params[FRAME_STEP] );
|
||||
}
|
||||
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_EVIS( signal_frame, _setup )
|
||||
|
|
@ -22,7 +22,6 @@
|
|||
*
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
|
@ -40,7 +39,6 @@
|
|||
|
||||
__BEGIN_DECLS
|
||||
|
||||
|
||||
#define _SLICE_KERNEL_SOURCE "slice"
|
||||
#define _SLICE_KERNEL_NAME CVIVANTE_NAMESPACE("evis.slice")
|
||||
|
||||
|
|
@ -379,7 +377,6 @@ static vsi_status _query_kernel
|
|||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
|
|
@ -421,7 +418,7 @@ static vsi_nn_kernel_node_t _setup
|
|||
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[0]->attr.size,
|
||||
reshape_tensors[0]->attr.dim_num ) || input_batch != output_batch )
|
||||
{
|
||||
return NULL;
|
||||
goto final;
|
||||
}
|
||||
|
||||
image_2d = (rank[0] < 3 || shapes[0][2] == 1);
|
||||
|
|
@ -443,6 +440,12 @@ static vsi_nn_kernel_node_t _setup
|
|||
}
|
||||
}
|
||||
|
||||
final:
|
||||
for (i = 0; i < _IO_NUM; i++)
|
||||
{
|
||||
vsi_safe_release_tensor(reshape_tensors[i]);
|
||||
}
|
||||
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,641 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "libnnext/vx_lib_nnext.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
typedef enum
|
||||
{
|
||||
INTERNAL_KERNEL_GET_MATRIX,
|
||||
INTERNAL_KERNEL_WARP_AFFINE,
|
||||
} _internal_kernel_e;
|
||||
|
||||
#define _GET_MATRIX_SOURCE "get_matrix"
|
||||
#define _WARP_AFFINE_SOURCE "warp_affine"
|
||||
|
||||
// Add kernel hashtable here
|
||||
#define GET_MATRIX_HASH_KEY( IN1_DTYPE, OUT_DTYPE ) \
|
||||
(( IN1_DTYPE << 8 ) | ( OUT_DTYPE ))
|
||||
#define GET_MATRIX_KERNEL_MAP( IN1_DTYPE, OUT_DTYPE ) \
|
||||
{ GET_MATRIX_HASH_KEY( IN1_DTYPE, OUT_DTYPE ), \
|
||||
CVIVANTE_NAMESPACE("evis.get_matrix_"#IN1_DTYPE"toF32"), \
|
||||
_GET_MATRIX_SOURCE }
|
||||
|
||||
#define WARP_AFFINE_HASH_KEY( IN0_DTYPE, OUT_DTYPE ) \
|
||||
(( IN0_DTYPE << 8 ) | ( OUT_DTYPE ))
|
||||
#define WARP_AFFINE_KERNEL_MAP( IN0_DTYPE, OUT_DTYPE ) \
|
||||
{ WARP_AFFINE_HASH_KEY( IN0_DTYPE, OUT_DTYPE ), \
|
||||
CVIVANTE_NAMESPACE("evis.warp_affine_"#IN0_DTYPE"to"#OUT_DTYPE), \
|
||||
_WARP_AFFINE_SOURCE }
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint32_t key;
|
||||
char * function_name;
|
||||
const char * source_name;
|
||||
} _kernel_map_type;
|
||||
|
||||
static const _kernel_map_type _get_matrix_kernel_map[] =
|
||||
{
|
||||
// Register kernel here
|
||||
GET_MATRIX_KERNEL_MAP( F16, F32 ),
|
||||
GET_MATRIX_KERNEL_MAP( I16, F32 ),
|
||||
GET_MATRIX_KERNEL_MAP( U8, F32 ),
|
||||
GET_MATRIX_KERNEL_MAP( I8, F32 ),
|
||||
};
|
||||
|
||||
static const _kernel_map_type _warp_affine_kernel_map[] =
|
||||
{
|
||||
// Register kernel here
|
||||
WARP_AFFINE_KERNEL_MAP( F16, F16 ),
|
||||
WARP_AFFINE_KERNEL_MAP( U8, U8 ),
|
||||
};
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _get_matrix_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
// Add kernel parameters here
|
||||
};
|
||||
#define _GET_MATRIX_PARAM_NUM _cnt_of_array( _get_matrix_kernel_param_def )
|
||||
#define HAS_THETA_1_1 (2)
|
||||
#define HAS_THETA_1_2 (3)
|
||||
#define HAS_THETA_1_3 (4)
|
||||
#define HAS_THETA_2_1 (5)
|
||||
#define HAS_THETA_2_2 (6)
|
||||
#define HAS_THETA_2_3 (7)
|
||||
#define THETA_1_1 (8)
|
||||
#define THETA_1_2 (9)
|
||||
#define THETA_1_3 (10)
|
||||
#define THETA_2_1 (11)
|
||||
#define THETA_2_2 (12)
|
||||
#define THETA_2_3 (13)
|
||||
#define I_WIDTH (14)
|
||||
#define I_HEIGHT (15)
|
||||
#define O_WIDTH (16)
|
||||
#define O_HEIGHT (17)
|
||||
|
||||
static vx_param_description_t _warp_affine_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
// Add kernel parameters here
|
||||
};
|
||||
#define _WARP_AFFINE_PARAM_NUM _cnt_of_array( _warp_affine_kernel_param_def )
|
||||
/*
|
||||
* Kernel initializer
|
||||
*/
|
||||
DEF_KERNEL_INITIALIZER(_get_matrix_initializer)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
gpu_param_t gpu_param = {
|
||||
2,
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0}
|
||||
};
|
||||
vsi_nn_kernel_tensor_attr_t * attr = NULL;
|
||||
vsi_int_array_t * in_shape = NULL;
|
||||
float theta[8] = {0};
|
||||
float input_scale = 1.0f;
|
||||
float input_tail = 0;
|
||||
float input_w = 1.0f;
|
||||
float input_h = 1.0f;
|
||||
float output_w = 1.0f;
|
||||
float output_h = 1.0f;
|
||||
float scale[4] = {0};
|
||||
|
||||
attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
|
||||
|
||||
if ( attr->quant == VSI_NN_KERNEL_QUANT_DFP )
|
||||
{
|
||||
int32_t fl = attr->dfp.fl;
|
||||
if (fl > 0)
|
||||
{
|
||||
input_scale = 1.0f / (float) ((int64_t)1 << fl);
|
||||
}
|
||||
else
|
||||
{
|
||||
input_scale = (float)((int64_t)1 << -fl);
|
||||
}
|
||||
}
|
||||
else if ( attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
|
||||
{
|
||||
input_scale = attr->asymm.scale;
|
||||
input_tail = 0 - attr->asymm.zero_point * input_scale;
|
||||
}
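/* At this point input_scale/input_tail describe the dequantization: DFP tensors use
 * scale = 2^-fl, asymmetric tensors use real = q * input_scale + input_tail with
 * input_tail = -zero_point * input_scale. */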
|
||||
|
||||
in_shape = attr->shape;
|
||||
|
||||
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_1_1], &theta[0]);
|
||||
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_1_2], &theta[1]);
|
||||
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_1_3], &theta[2]);
|
||||
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_2_1], &theta[4]);
|
||||
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_2_2], &theta[5]);
|
||||
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_2_3], &theta[6]);
|
||||
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[I_WIDTH], &input_w);
|
||||
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[I_HEIGHT], &input_h);
|
||||
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[O_WIDTH], &output_w);
|
||||
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[O_HEIGHT], &output_h);
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
|
||||
scale[0] = input_w / output_w;
|
||||
scale[1] = input_h / output_h;
|
||||
scale[2] = input_w / output_h;
|
||||
scale[3] = input_h / output_w;
|
||||
|
||||
gpu_param.global_scale[0] = 1;
|
||||
gpu_param.global_scale[1] = 1;
|
||||
gpu_param.global_size[0] = 1;
|
||||
gpu_param.global_size[1] = in_shape->data[1];
|
||||
|
||||
status = vsi_nn_kernel_gpu_add_param( node,
|
||||
"theta_1", &theta[0] );
|
||||
status |= vsi_nn_kernel_gpu_add_param( node,
|
||||
"theta_2", &theta[4] );
|
||||
status |= vsi_nn_kernel_gpu_add_param( node,
|
||||
"scale", &scale );
|
||||
status |= vsi_nn_kernel_gpu_add_param( node,
|
||||
"input_scale", &input_scale );
|
||||
status |= vsi_nn_kernel_gpu_add_param( node,
|
||||
"input_tail", &input_tail );
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
||||
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
final:
|
||||
if (attr)
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &attr );
|
||||
attr = NULL;
|
||||
}
|
||||
|
||||
return status;
|
||||
} /* _get_matrix_initializer() */
|
||||
|
||||
DEF_KERNEL_INITIALIZER(_warp_affine_initializer)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
gpu_param_t gpu_param = {
|
||||
3,
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0}
|
||||
};
|
||||
vsi_nn_kernel_tensor_attr_t * attr[2] = {NULL};
|
||||
vsi_int_array_t * out_shape = NULL;
|
||||
float input_scale = 1.0f;
|
||||
float input_tail = 0;
|
||||
float output_scale = 1.0f;
|
||||
float output_zp = 0;
|
||||
|
||||
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
|
||||
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
|
||||
|
||||
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
|
||||
{
|
||||
int32_t fl = attr[0]->dfp.fl;
|
||||
if (fl > 0)
|
||||
{
|
||||
input_scale = 1.0f / (float) ((int64_t)1 << fl);
|
||||
}
|
||||
else
|
||||
{
|
||||
input_scale = (float)((int64_t)1 << -fl);
|
||||
}
|
||||
}
|
||||
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
|
||||
{
|
||||
input_scale = attr[0]->asymm.scale;
|
||||
input_tail = 0 - attr[0]->asymm.zero_point * input_scale;
|
||||
}
|
||||
|
||||
if (attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP)
|
||||
{
|
||||
int32_t fl = attr[1]->dfp.fl;
|
||||
|
||||
if (fl >= 0)
|
||||
{
|
||||
output_scale = (vx_float32) ((vx_int64)1 << fl);
|
||||
}
|
||||
else if (fl < 0)
|
||||
{
|
||||
output_scale = 1.0f / (vx_float32) ((vx_int64)1 << -fl);
|
||||
}
|
||||
}
|
||||
else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
|
||||
{
|
||||
output_scale = 1.0f / attr[1]->asymm.scale;
|
||||
output_zp = (float)attr[1]->asymm.zero_point;
|
||||
}
|
||||
|
||||
out_shape = attr[1]->shape;
|
||||
|
||||
gpu_param.global_scale[0] = 2;
|
||||
gpu_param.global_scale[1] = 1;
|
||||
gpu_param.global_scale[2] = 1;
|
||||
gpu_param.global_size[0] = gpu_align_p2(
|
||||
(out_shape->data[0] + gpu_param.global_scale[0] - 1)
|
||||
/ gpu_param.global_scale[0], 4);
|
||||
gpu_param.global_size[1] = out_shape->data[1];
|
||||
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
|
||||
|
||||
do
|
||||
{
|
||||
gpu_dp_inst_t uniConvertDatatoF32_0_4x4 = {{
|
||||
0x01010101, // TCfg
|
||||
0x01010000, // ASelt
|
||||
0x00010000, 0x00010000, // ABin
|
||||
0x02020202, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00000100, // AccumType, ConstantType, and PostShift
|
||||
0x00003c00, 0x00000000, 0x00003c00, 0x00000000,
|
||||
0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
|
||||
}, GPU_DP_TYPE_16};
|
||||
gpu_dp_inst_t uniConvertDatatoF32_1_4x4 = {{
|
||||
0x01010101, // TCfg
|
||||
0x01010000, // ASelt
|
||||
0x00030002, 0x00030002, // ABin
|
||||
0x02020202, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00000100, // AccumType, ConstantType, and PostShift
|
||||
0x00003c00, 0x00000000, 0x00003c00, 0x00000000,
|
||||
0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
|
||||
}, GPU_DP_TYPE_16};
|
||||
gpu_dp_inst_t uniExtractHalf8_2x8 = {{
|
||||
0x11111111, // TCfg
|
||||
0x11110000, // ASelt
|
||||
0x06040200, 0x06040200, // ABin
|
||||
0x22222222, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00000100, // AccumType, ConstantType, and PostShift
|
||||
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
|
||||
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
|
||||
}, GPU_DP_TYPE_16 };
|
||||
gpu_dp_inst_t uniExtractInteger_2x8 = {{
|
||||
0x33333333, // TCfg
|
||||
0x11110000, // ASelt
|
||||
0x03020100, 0x03020100, // ABin
|
||||
0x00000000, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00002400, // AccumType, ConstantType, and PostShift
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
|
||||
}, GPU_DP_TYPE_16 };
|
||||
|
||||
status = vsi_nn_kernel_gpu_add_param( node, "uniConvertDatatoF32_0_4x4", &uniConvertDatatoF32_0_4x4);
|
||||
status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertDatatoF32_1_4x4", &uniConvertDatatoF32_1_4x4);
|
||||
status |= vsi_nn_kernel_gpu_add_param( node, "input_scale", &input_scale);
|
||||
status |= vsi_nn_kernel_gpu_add_param( node, "input_tail", &input_tail);
|
||||
status |= vsi_nn_kernel_gpu_add_param( node, "output_scale", &output_scale);
|
||||
status |= vsi_nn_kernel_gpu_add_param( node, "output_zp", &output_zp);
|
||||
if (attr[1]->dtype == F16)
|
||||
{
|
||||
status |= vsi_nn_kernel_gpu_add_param( node,
|
||||
"uniExtract8Data_2x8", &uniExtractHalf8_2x8 );
|
||||
}
|
||||
else
|
||||
{
|
||||
status |= vsi_nn_kernel_gpu_add_param( node,
|
||||
"uniExtract8Data_2x8", &uniExtractInteger_2x8 );
|
||||
}
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
}while(0);
|
||||
|
||||
|
||||
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
final:
|
||||
if (attr[0])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &attr[0] );
|
||||
attr[0] = NULL;
|
||||
}
|
||||
|
||||
if (attr[1])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &attr[1] );
|
||||
attr[1] = NULL;
|
||||
}
|
||||
|
||||
return status;
|
||||
} /* _warp_affine_initializer() */
|
||||
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
const uint32_t hashkey,
|
||||
_internal_kernel_e kernel_id
|
||||
)
|
||||
{
|
||||
vx_kernel_initialize_f initializer = NULL;
|
||||
vx_param_description_t * param_def;
|
||||
vsi_status status = VSI_FAILURE;
|
||||
const _kernel_map_type* kernel_map;
|
||||
size_t kernel_map_size;
|
||||
size_t param_size;
|
||||
uint32_t i;
|
||||
|
||||
switch( kernel_id )
|
||||
{
|
||||
case INTERNAL_KERNEL_GET_MATRIX:
|
||||
initializer = _get_matrix_initializer;
|
||||
kernel_map = _get_matrix_kernel_map;
|
||||
kernel_map_size = _cnt_of_array( _get_matrix_kernel_map );
|
||||
param_def = _get_matrix_kernel_param_def;
|
||||
param_size = _GET_MATRIX_PARAM_NUM;
|
||||
break;
|
||||
case INTERNAL_KERNEL_WARP_AFFINE:
|
||||
initializer = _warp_affine_initializer;
|
||||
kernel_map = _warp_affine_kernel_map;
|
||||
kernel_map_size = _cnt_of_array( _warp_affine_kernel_map );
|
||||
param_def = _warp_affine_kernel_param_def;
|
||||
param_size = _WARP_AFFINE_PARAM_NUM;
|
||||
break;
|
||||
default:
|
||||
VSI_ASSERT( FALSE );
|
||||
return VSI_FAILURE;
|
||||
}
|
||||
|
||||
for( i = 0; i < kernel_map_size; i ++ )
|
||||
{
|
||||
if( kernel_map[i].key == hashkey )
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
if( i < kernel_map_size )
|
||||
{
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
|
||||
kernel->info.parameters = param_def;
|
||||
kernel->info.numParams = (uint32_t)param_size;
|
||||
kernel->info.initialize = initializer;
|
||||
// Register code source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
|
||||
"vsi_nn_kernel_header",
|
||||
kernel_map[i].source_name );
|
||||
// Register binary source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
|
||||
kernel_map[i].source_name );
|
||||
status = VSI_SUCCESS;
|
||||
}
|
||||
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
#define INTERNAL_KERNEL_SIZE (2)
|
||||
#define MATRIX_INDEX (0)
|
||||
#define WARP_AFFINE_INDEX (1)
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_GET_MATRIX_PARAM_NUM] = { NULL };
|
||||
vsi_nn_kernel_node_param_t warp_affine_node_params[_WARP_AFFINE_PARAM_NUM] = { NULL };
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
vsi_nn_kernel_dtype_e in0_dtype;
|
||||
vsi_nn_kernel_dtype_e in1_dtype;
|
||||
vsi_nn_kernel_dtype_e out_dtype;
|
||||
vsi_nn_tensor_attr_t attr;
|
||||
vsi_nn_kernel_t * ikernels[INTERNAL_KERNEL_SIZE] = { NULL };
|
||||
vsi_nn_tensor_t * tensors[INTERNAL_KERNEL_SIZE] = { NULL };
|
||||
vsi_nn_tensor_t * warp_affine_tensors[2] = {NULL};
|
||||
uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 };
|
||||
int32_t has_theta_1_1 = vsi_nn_kernel_param_get_int32( params, "has_theta_1_1" );
|
||||
int32_t has_theta_1_2 = vsi_nn_kernel_param_get_int32( params, "has_theta_1_2" );
|
||||
int32_t has_theta_1_3 = vsi_nn_kernel_param_get_int32( params, "has_theta_1_3" );
|
||||
int32_t has_theta_2_1 = vsi_nn_kernel_param_get_int32( params, "has_theta_2_1" );
|
||||
int32_t has_theta_2_2 = vsi_nn_kernel_param_get_int32( params, "has_theta_2_2" );
|
||||
int32_t has_theta_2_3 = vsi_nn_kernel_param_get_int32( params, "has_theta_2_3" );
|
||||
float theta_1_1 = vsi_nn_kernel_param_get_float32( params, "theta_1_1" );
|
||||
float theta_1_2 = vsi_nn_kernel_param_get_float32( params, "theta_1_2" );
|
||||
float theta_1_3 = vsi_nn_kernel_param_get_float32( params, "theta_1_3" );
|
||||
float theta_2_1 = vsi_nn_kernel_param_get_float32( params, "theta_2_1" );
|
||||
float theta_2_2 = vsi_nn_kernel_param_get_float32( params, "theta_2_2" );
|
||||
float theta_2_3 = vsi_nn_kernel_param_get_float32( params, "theta_2_3" );
|
||||
int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" );
|
||||
float input_w = (float)inputs[0]->attr.size[0];
|
||||
float input_h = (float)inputs[0]->attr.size[1];
|
||||
float output_w = (float)outputs[0]->attr.size[0];
|
||||
float output_h = (float)outputs[0]->attr.size[1];
|
||||
int32_t i = 0;
|
||||
|
||||
if (align_corners && output_w > 1)
|
||||
{
|
||||
output_w = output_w - 1;
|
||||
}
|
||||
|
||||
if (align_corners && output_h > 1)
|
||||
{
|
||||
output_h = output_h - 1;
|
||||
}
|
||||
|
||||
// Check if gpu can support the size
|
||||
if( !vsi_nn_kernel_gpu_check_shape(
|
||||
(int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) )
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ )
|
||||
{
|
||||
ikernels[i] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS );
|
||||
// Assign unique_id
|
||||
ikernels[i]->unique_id = kernel->unique_id;
|
||||
}
|
||||
|
||||
memcpy( &attr, &(inputs[1]->attr), sizeof(vsi_nn_tensor_attr_t) );
|
||||
attr.size[0] = 16;
|
||||
attr.dim_num = 2;
|
||||
attr.dtype.vx_type = VSI_NN_TYPE_UINT16;
|
||||
attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
|
||||
attr.is_const = FALSE;
|
||||
attr.vtl = TRUE;
|
||||
tensors[0] = vsi_nn_CreateTensor( graph, &attr );
|
||||
|
||||
attr.size[3] = attr.size[1];
|
||||
attr.size[2] = attr.size[1] = 1;
|
||||
attr.dim_num = inputs[0]->attr.dim_num;
|
||||
tensors[1] = vsi_nn_reshape_tensor( graph,
|
||||
tensors[0], (uint32_t*)attr.size, attr.dim_num );
|
||||
|
||||
warp_affine_tensors[0] = inputs[0];
|
||||
warp_affine_tensors[1] = tensors[1];
|
||||
|
||||
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
|
||||
in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
|
||||
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
|
||||
|
||||
hashkeys[MATRIX_INDEX]= GET_MATRIX_HASH_KEY( in1_dtype, F32 );
|
||||
hashkeys[WARP_AFFINE_INDEX] = WARP_AFFINE_HASH_KEY( in0_dtype, out_dtype );
|
||||
|
||||
status = _query_kernel( ikernels[MATRIX_INDEX], hashkeys[MATRIX_INDEX], INTERNAL_KERNEL_GET_MATRIX );
|
||||
if( VSI_SUCCESS != status )
|
||||
{
|
||||
goto final;
|
||||
}
|
||||
|
||||
status = _query_kernel( ikernels[WARP_AFFINE_INDEX], hashkeys[WARP_AFFINE_INDEX], INTERNAL_KERNEL_WARP_AFFINE );
|
||||
if( VSI_SUCCESS != status )
|
||||
{
|
||||
goto final;
|
||||
}
|
||||
|
||||
// Get Matrix
|
||||
node = vsi_nn_kernel_create_node( graph, ikernels[MATRIX_INDEX] );
|
||||
vsi_nn_kernel_node_pack_io( node_params, _GET_MATRIX_PARAM_NUM,
|
||||
&inputs[1], 1, &tensors[0], 1 );
|
||||
node_params[HAS_THETA_1_1] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_1 );
|
||||
node_params[HAS_THETA_1_2] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_2 );
|
||||
node_params[HAS_THETA_1_3] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_3 );
|
||||
node_params[HAS_THETA_2_1] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_1 );
|
||||
node_params[HAS_THETA_2_2] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_2 );
|
||||
node_params[HAS_THETA_2_3] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_3 );
|
||||
node_params[THETA_1_1] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_1 );
|
||||
node_params[THETA_1_2] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_2 );
|
||||
node_params[THETA_1_3] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_3 );
|
||||
node_params[THETA_2_1] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_1 );
|
||||
node_params[THETA_2_2] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_2 );
|
||||
node_params[THETA_2_3] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_3 );
|
||||
node_params[I_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &input_w );
|
||||
node_params[I_HEIGHT] = vsi_nn_kernel_scalar_create( graph, F32, &input_h );
|
||||
node_params[O_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &output_w );
|
||||
node_params[O_HEIGHT] = vsi_nn_kernel_scalar_create( graph, F32, &output_h );
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _GET_MATRIX_PARAM_NUM );
|
||||
vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_1] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_2] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_3] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_1] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_2] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_3] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[THETA_1_1] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[THETA_1_2] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[THETA_1_3] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[THETA_2_1] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[THETA_2_2] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[THETA_2_3] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[I_WIDTH] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[I_HEIGHT] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[O_WIDTH] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[O_HEIGHT] );
|
||||
vsi_nn_kernel_node_release( &node );
|
||||
|
||||
// Warp Affine
|
||||
node = vsi_nn_kernel_create_node( graph, ikernels[WARP_AFFINE_INDEX] );
|
||||
if (node)
|
||||
{
|
||||
vx_border_t border;
|
||||
border.mode = VX_BORDER_CONSTANT;
|
||||
border.constant_value.U32 = 0;
|
||||
border.constant_value.S16 = 0;
|
||||
border.constant_value.U8 = 0;
|
||||
if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 &&
|
||||
inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC)
|
||||
{
|
||||
border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point;
|
||||
}
|
||||
status = vsi_nn_kernel_node_set_border( node, &border );
|
||||
VSI_ASSERT( status == VSI_SUCCESS );
|
||||
}
|
||||
vsi_nn_kernel_node_pack_io( warp_affine_node_params, _WARP_AFFINE_PARAM_NUM,
|
||||
warp_affine_tensors, 2, outputs, 1 );
|
||||
status = vsi_nn_kernel_node_pass_param( node, warp_affine_node_params, _WARP_AFFINE_PARAM_NUM );
|
||||
final:
|
||||
for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ )
|
||||
{
|
||||
if( ikernels[i] )
|
||||
{
|
||||
vsi_nn_kernel_release( &ikernels[i] );
|
||||
}
|
||||
if( tensors[i] )
|
||||
{
|
||||
vsi_nn_ReleaseTensor( &tensors[i] );
|
||||
}
|
||||
}
|
||||
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_EVIS( spatial_transformer, _setup )
|
||||
|
|
@ -0,0 +1,248 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "utils/vsi_nn_dtype_util.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
/*
|
||||
* Define kernel meta.
|
||||
*/
|
||||
#define KERNEL_SOURCE "tensorstackconcat",
|
||||
|
||||
#define HASH_SH_KEY( IN_DTYPE, OUT_DTYPE, _image_2d ) \
|
||||
(( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) | (_image_2d))
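// The key packs the input dtype into bits 20 and up, the output dtype into bits 8..19,
// and the 2D flag into the low bits.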
|
||||
|
||||
#define PACK_KERNEL_8BITS_MAP(SRC_TYPE, OUT_TYPE) \
|
||||
{ HASH_SH_KEY(SRC_TYPE, OUT_TYPE, 0), \
|
||||
CVIVANTE_NAMESPACE("evis.tensorstackconcat_8bits"), \
|
||||
KERNEL_SOURCE },
|
||||
|
||||
#define PACK_KERNEL_8BITS_MAP_2D(SRC_TYPE, OUT_TYPE) \
|
||||
{ HASH_SH_KEY(SRC_TYPE, OUT_TYPE, 1), \
|
||||
CVIVANTE_NAMESPACE("evis.tensorstackconcat_8bits_2D"), \
|
||||
KERNEL_SOURCE },
|
||||
|
||||
#define PACK_KERNEL_16BITS_MAP(SRC_TYPE, OUT_TYPE) \
|
||||
{ HASH_SH_KEY(SRC_TYPE, OUT_TYPE, 0), \
|
||||
CVIVANTE_NAMESPACE("evis.tensorstackconcat_16bits"), \
|
||||
KERNEL_SOURCE },
|
||||
|
||||
#define PACK_KERNEL_16BITS_MAP_2D(SRC_TYPE, OUT_TYPE) \
|
||||
{ HASH_SH_KEY(SRC_TYPE, OUT_TYPE, 1), \
|
||||
CVIVANTE_NAMESPACE("evis.tensorstackconcat_16bits_2D"), \
|
||||
KERNEL_SOURCE },
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint32_t key;
|
||||
char * function_name;
|
||||
const char * source_name;
|
||||
} _kernel_map_type;
|
||||
|
||||
static const _kernel_map_type _tensorstackconcat_kernel_map[] =
|
||||
{
|
||||
// Register kernel here
|
||||
PACK_KERNEL_8BITS_MAP( I8, I8 )
|
||||
PACK_KERNEL_8BITS_MAP( U8, U8 )
|
||||
PACK_KERNEL_8BITS_MAP_2D( I8, I8 )
|
||||
PACK_KERNEL_8BITS_MAP_2D( U8, U8 )
|
||||
|
||||
PACK_KERNEL_16BITS_MAP( F16, F16 )
|
||||
PACK_KERNEL_16BITS_MAP( BF16, BF16 )
|
||||
PACK_KERNEL_16BITS_MAP( I16, I16 )
|
||||
PACK_KERNEL_16BITS_MAP_2D( F16, F16 )
|
||||
PACK_KERNEL_16BITS_MAP_2D( BF16, BF16 )
|
||||
PACK_KERNEL_16BITS_MAP_2D( I16, I16 )
|
||||
};
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _tensorstackconcat_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
};
|
||||
#define _TENSORSTACKCONCAT_PARAM_NUM _cnt_of_array( _tensorstackconcat_kernel_param_def )
|
||||
|
||||
/*
|
||||
* Kernel initializer
|
||||
*/
|
||||
DEF_KERNEL_INITIALIZER(_tensorstackconcat_initializer)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
gpu_param_t gpu_param = {
|
||||
3,
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0}
|
||||
};
|
||||
vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
|
||||
vsi_int_array_t * in_shape = NULL;
|
||||
// Add initializer
|
||||
|
||||
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
|
||||
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
|
||||
|
||||
in_shape = input_attr->shape;
|
||||
|
||||
if (input_attr->dtype == I16 || input_attr->dtype == F16)
|
||||
{
|
||||
gpu_param.global_scale[0] = 8;
|
||||
}
|
||||
else
|
||||
{
|
||||
gpu_param.global_scale[0] = 16;
|
||||
}
|
||||
gpu_param.global_scale[1] = 1;
|
||||
gpu_param.global_scale[2] = 1;
|
||||
|
||||
gpu_param.global_size[0] = gpu_align_p2(
|
||||
(in_shape->data[0] + gpu_param.global_scale[0] - 1)
|
||||
/ gpu_param.global_scale[0], 4);
|
||||
gpu_param.global_size[1] = 1;
|
||||
gpu_param.global_size[2] = in_shape->size > 2 ? in_shape->data[2] : 1;
|
||||
|
||||
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
|
||||
|
||||
final:
|
||||
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
|
||||
SAFE_FREE_TENSOR_ATTR(input_attr);
|
||||
#undef SAFE_FREE_TENSOR_ATTR
|
||||
|
||||
return status;
|
||||
} /* _tensorstackconcat_initializer() */
|
||||
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs,
|
||||
vsi_bool image_2d
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_dtype_e in_dtype;
|
||||
vsi_nn_kernel_dtype_e out_dtype;
|
||||
const _kernel_map_type * kernel_map = _tensorstackconcat_kernel_map;
|
||||
size_t kernel_map_size = _cnt_of_array( _tensorstackconcat_kernel_map );
|
||||
vx_param_description_t * param_def = _tensorstackconcat_kernel_param_def;
|
||||
vx_kernel_initialize_f initializer = _tensorstackconcat_initializer;
|
||||
|
||||
uint32_t key;
|
||||
uint32_t i;
|
||||
|
||||
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
|
||||
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
|
||||
|
||||
key = HASH_SH_KEY( in_dtype, out_dtype, image_2d );
|
||||
|
||||
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
|
||||
{
|
||||
if ( kernel_map[i].key == key )
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
if ( i < (uint32_t)kernel_map_size )
|
||||
{
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
|
||||
kernel->info.parameters = param_def;
|
||||
kernel->info.numParams = _cnt_of_array( _tensorstackconcat_kernel_param_def );
|
||||
kernel->info.initialize = initializer;
|
||||
// Register code source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
|
||||
"vsi_nn_kernel_header",
|
||||
kernel_map[i].source_name );
|
||||
// Register binary source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
|
||||
kernel_map[i].source_name );
|
||||
status = VSI_SUCCESS;
|
||||
}
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_TENSORSTACKCONCAT_PARAM_NUM];
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
vsi_bool image_2d = FALSE;
|
||||
|
||||
image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1);
|
||||
status = _query_kernel( kernel, inputs, outputs, image_2d );
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( node_params, _TENSORSTACKCONCAT_PARAM_NUM,
|
||||
inputs, input_num, outputs, output_num );
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _TENSORSTACKCONCAT_PARAM_NUM );
|
||||
}
|
||||
}
|
||||
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_EVIS( tensorstackconcat, _setup )
|
||||
|
|
@ -444,14 +444,15 @@ static vsi_status _gpu_register
|
|||
if( VSI_NN_KERNEL_TYPE_EVIS == kernel->type )
|
||||
{
|
||||
cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN,
|
||||
"-cl-viv-vx-extension -D VX_VERSION=2" );
|
||||
"-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d",
|
||||
context->config.use_40bits_va );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN,
|
||||
"-cl-viv-vx-extension -D VX_VERSION=%d",
|
||||
context->config.evis.ver );
|
||||
"-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d",
|
||||
context->config.evis.ver, context->config.use_40bits_va );
|
||||
}
|
||||
// Pack build option
|
||||
if( kernel->gpu.sources[active_fmt].build_option.data )
|
||||
|
|
@ -812,7 +813,6 @@ void vsi_nn_kernel_add_build_option
|
|||
}
|
||||
snprintf( &buf[org_size], item_size + 2, " %s", option );
|
||||
build_option->data = buf;
|
||||
|
||||
} /* vsi_nn_kernel_add_build_option() */
|
||||
|
||||
void vsi_nn_kernel_release
|
||||
|
|
@ -1224,18 +1224,7 @@ vsi_status vsi_nn_kernel_pirority_set
|
|||
|
||||
static vsi_bool _check_shader_support(vsi_nn_graph_t* graph)
|
||||
{
|
||||
char *envctrl;
|
||||
static int32_t enableShader = -1;
|
||||
|
||||
if (enableShader == -1)
|
||||
{
|
||||
enableShader = 1;
|
||||
envctrl = getenv("VIV_VX_ENABLE_SHADER");
|
||||
if (envctrl)
|
||||
{
|
||||
enableShader = atoi(envctrl);
|
||||
}
|
||||
}
|
||||
int32_t enableShader = graph->ctx->options.enable_shader;
|
||||
|
||||
#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
|
||||
if ( graph->ctx->config.subGroupSize == 0 )
|
||||
|
|
@ -1251,4 +1240,3 @@ static vsi_bool _check_shader_support(vsi_nn_graph_t* graph)
|
|||
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -127,5 +127,7 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(hard_sigmoid)
|
|||
REGISTER_VX_FIRST_KERNEL_SELECTOR(clip)
|
||||
REGISTER_VX_FIRST_KERNEL_SELECTOR(relu_keras)
|
||||
REGISTER_VX_FIRST_KERNEL_SELECTOR(erf)
|
||||
REGISTER_VX_FIRST_KERNEL_SELECTOR(gelu)
|
||||
REGISTER_VX_FIRST_KERNEL_SELECTOR(hard_gelu)
|
||||
|
||||
__END_DECLS
|
||||
|
|
|
|||
|
|
@ -78,6 +78,59 @@ static float mish_eval(float data, float alpha)
|
|||
return data;
|
||||
}
|
||||
|
||||
static float erf_eval(float x)
|
||||
{
|
||||
float res = 0;
|
||||
float tmp = x;
|
||||
float factorial = 1; /*n!*/
|
||||
float x_pow = x;
|
||||
int32_t one = 1;
|
||||
int32_t n = 1;
|
||||
|
||||
if (x <= -3)
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
else if (x >= 3)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
while (vsi_abs(tmp) > 1e-5)
|
||||
{
|
||||
res += tmp;
|
||||
|
||||
factorial *= n;
|
||||
one *= -1;
|
||||
x_pow *= x * x;
|
||||
tmp = one / factorial * x_pow / ( 2 * n + 1);
|
||||
|
||||
n ++;
|
||||
}
|
||||
#define VSI_MUL2_RSQRTPI (1.1283791670955126f)
|
||||
|
||||
res *= VSI_MUL2_RSQRTPI;
|
||||
|
||||
return res;
|
||||
}
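/* erf_eval accumulates the Maclaurin series erf(x) = (2/sqrt(pi)) * sum_n (-1)^n * x^(2n+1) / (n! * (2n+1))
 * until the next term drops below 1e-5, and saturates to +/-1 for |x| >= 3. */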
|
||||
|
||||
static float gelu_eval(float data, float alpha)
|
||||
{
|
||||
data = (float)(0.5f * data * (1 + erf_eval(data / (float)sqrt(2.0f))));
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
|
||||
#define VSI_SQRT_2_RCP_PI 0.7978845834732056f
|
||||
static float hgelu_eval(float data, float alpha)
|
||||
{
|
||||
float cdf = (float)(0.5f * (1.0f + tanh((VSI_SQRT_2_RCP_PI *
|
||||
(data + 0.044715f * data * data * data)))));
|
||||
|
||||
return data * cdf;
|
||||
}
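/* gelu_eval implements the exact GELU, 0.5 * x * (1 + erf(x / sqrt(2))); hgelu_eval uses the
 * tanh approximation 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))). */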
|
||||
|
||||
#ifdef VX_USER_LOOKUP_TABLE_SUPPORT
|
||||
static int32_t _lut_comparator(const void *pa, const void *pb)
|
||||
{
|
||||
|
|
@ -232,6 +285,8 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( log, log_eval )
|
|||
REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( elu, elu_eval )
|
||||
REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( neg, neg_eval )
|
||||
REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( hard_sigmoid, hsigmoid_eval )
|
||||
REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( gelu, gelu_eval )
|
||||
REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( hard_gelu, hgelu_eval )
|
||||
|
||||
#undef REGISTER_ELTWISE_UNARY_OPENVX_KERNEL
|
||||
|
||||
|
|
|
|||
|
|
@ -38,8 +38,9 @@ typedef struct _sort_lut_s
|
|||
float val;
|
||||
} sort_lut;
|
||||
|
||||
static float erf_eval(float x)
|
||||
static float erf_eval(float _x)
|
||||
{
|
||||
float x = vsi_clamp(_x, -2, 2);
|
||||
float res = 0;
|
||||
float tmp = x;
|
||||
float factorial = 1; /*n!*/
|
||||
|
|
|
|||
|
|
@ -1,4 +1,10 @@
|
|||
|
||||
#define READ_IMAGEF_ARRAY2D(dest, tensor, coord) \
|
||||
do { \
|
||||
int depth = get_image_array_size(tensor); \
|
||||
_viv_asm(CLAMP0MAX, (coord).z, (coord).z, depth - 1); \
|
||||
dest = read_imagef(tensor, coord); \
|
||||
} while(0)
|
||||
__kernel void batch_norm_F32toF32
|
||||
(
|
||||
__read_only image2d_array_t input,
|
||||
|
|
@ -17,11 +23,11 @@ __kernel void batch_norm_F32toF32
|
|||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
|
||||
|
||||
float4 src, mean, var, gamma, beta;
|
||||
readImage2DArray(src, input, coord);
|
||||
readImage2DArray(mean, Mean, coord);
|
||||
readImage2DArray(var, Variance, coord);
|
||||
readImage2DArray(gamma, Gamma, coord);
|
||||
readImage2DArray(beta, Beta, coord);
|
||||
READ_IMAGEF_2DARRAY(src, input, coord);
|
||||
READ_IMAGEF_2DARRAY(mean, Mean, coord);
|
||||
READ_IMAGEF_2DARRAY(var, Variance, coord);
|
||||
READ_IMAGEF_2DARRAY(gamma, Gamma, coord);
|
||||
READ_IMAGEF_2DARRAY(beta, Beta, coord);
|
||||
|
||||
float4 dst;
|
||||
src.x = src.x - mean.x;
|
||||
|
|
@ -81,11 +87,11 @@ __kernel void batch_norm_U8toU8
|
|||
|
||||
uint4 data;
|
||||
float4 src, mean, var, gamma, beta;
|
||||
readImage2DArray(data, input, coord);
|
||||
readImage2DArray(mean, Mean, coord);
|
||||
readImage2DArray(var, Variance, coord);
|
||||
readImage2DArray(gamma, Gamma, coord);
|
||||
readImage2DArray(beta, Beta, coord);
|
||||
READ_IMAGEF_2DARRAY(data, input, coord);
|
||||
READ_IMAGEF_2DARRAY(mean, Mean, coord);
|
||||
READ_IMAGEF_2DARRAY(var, Variance, coord);
|
||||
READ_IMAGEF_2DARRAY(gamma, Gamma, coord);
|
||||
READ_IMAGEF_2DARRAY(beta, Beta, coord);
|
||||
|
||||
src = convert_float4(data) * input_scale - input_tail;
|
||||
src.x = src.x - mean.x;
|
||||
|
|
|
|||
|
|
@ -18,11 +18,19 @@ inline Image create_image_from_image2d(image2d_t input, int stride_x)
|
|||
int8 desc;
|
||||
_viv_asm(COPY, desc, input, sizeof(desc));
|
||||
|
||||
#if (USE_40BITS_VA==0)
|
||||
uint address = as_uint(desc.s0);
|
||||
int stride_y = desc.s1;
|
||||
#else
|
||||
ulong address = as_ulong(desc.s05);
|
||||
int stride_y = desc.s6;
|
||||
#endif
|
||||
|
||||
Image img =
|
||||
{
|
||||
.ptr = (uchar*)desc.s0,
|
||||
.ptr = (uchar*)address,
|
||||
.stride_x = stride_x,
|
||||
.stride_y = desc.s1
|
||||
.stride_y = stride_y
|
||||
};
|
||||
|
||||
return img;
|
||||
|
|
@ -36,53 +44,60 @@ typedef struct Tensor
|
|||
int stride_z;
|
||||
} Tensor;
|
||||
|
||||
inline uchar* create_tensor_ptr_from_coord(Tensor t, int4 coord)
|
||||
inline uchar* get_tensor_ptr_from_coord(Tensor t, int4 coord)
|
||||
{
|
||||
return t.ptr + coord.x * t.stride_x + coord.y * t.stride_y + coord.z * t.stride_z;
|
||||
}
|
||||
|
||||
inline Tensor create_tensor_from_image2d_array(image2d_array_t input, int stride_x)
|
||||
{
|
||||
#if (USE_40BITS_VA==0)
|
||||
int8 desc;
|
||||
_viv_asm(COPY, desc, input, sizeof(desc));
|
||||
|
||||
uint address = as_uint(desc.s0);
|
||||
int stride_y = desc.s1;
|
||||
int stride_z = desc.s4;
|
||||
#else
|
||||
int16 desc;
|
||||
_viv_asm(COPY, desc, input, sizeof(desc));
|
||||
|
||||
ulong address = as_ulong(desc.s05);
|
||||
int stride_y = desc.s6;
|
||||
int stride_z = desc.sa;
|
||||
#endif
|
||||
|
||||
Tensor t =
|
||||
{
|
||||
.ptr = (uchar*)desc.s0,
|
||||
.ptr = (uchar*)address,
|
||||
.stride_x = stride_x,
|
||||
.stride_y = desc.s1,
|
||||
.stride_z = desc.s4
|
||||
.stride_y = stride_y,
|
||||
.stride_z = stride_z
|
||||
};
|
||||
|
||||
return t;
|
||||
}
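/* With USE_40BITS_VA the descriptor carries a 64-bit base address (desc.s0/s5) and the
 * y/z strides sit in higher descriptor slots, so the pointer and strides are taken from
 * different lanes than in the 32-bit layout above. */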
|
||||
|
||||
#define readImage2DArray(Dest, Image, Coord) \
|
||||
do { \
|
||||
int8 desc; \
|
||||
_viv_asm(COPY, desc, Image, sizeof(desc)); \
|
||||
_viv_asm(CLAMP0MAX, (Coord).w, (Coord).z, desc.s5 - 1); \
|
||||
int baseAddr = (int)(Coord).w * desc.s4 + desc.s0; \
|
||||
_viv_asm(MOV, (Coord).w, baseAddr); \
|
||||
_viv_asm(IMAGE_READ_3D, Dest, Image, (Coord).xyww); \
|
||||
} while (0)
|
||||
#define READ_IMAGEF_2DARRAY(dest, tensor, coord) \
|
||||
do { \
|
||||
int depth = get_image_array_size(tensor); \
|
||||
int4 coord_in = coord; \
|
||||
_viv_asm(CLAMP0MAX, coord_in.z, coord_in.z, depth - 1); \
|
||||
dest = read_imagef(tensor, coord_in); \
|
||||
} while(0)
|
||||
|
||||
#define writeImage2DArray(Image, Coord, Color) \
|
||||
do { \
|
||||
int8 desc; \
|
||||
_viv_asm(COPY, desc, Image, sizeof(desc)); \
|
||||
_viv_asm(CLAMP0MAX, (Coord).w, (Coord).z, desc.s5 - 1); \
|
||||
int baseAddr = (int)(Coord).w * desc.s4 + desc.s0; \
|
||||
_viv_asm(MOV, (Coord).w, baseAddr); \
|
||||
_viv_asm(IMAGE_WRITE_3D, Color, Image, (Coord).xyww); \
|
||||
} while (0)
|
||||
#define READ_IMAGEI_2DARRAY(dest, tensor, coord) \
|
||||
do { \
|
||||
int depth = get_image_array_size(tensor); \
|
||||
int4 coord_in = coord; \
|
||||
_viv_asm(CLAMP0MAX, coord_in.z, coord_in.z, depth - 1); \
|
||||
dest = read_imagei(tensor, coord_in); \
|
||||
} while(0)
|
||||
|
||||
#define readImage(Dest, Image, Coord) \
|
||||
do { \
|
||||
_viv_asm(IMAGE_READ, Dest, Image, Coord); \
|
||||
} while (0)
|
||||
|
||||
#define writeImage(Image, Coord, Color) \
|
||||
do { \
|
||||
_viv_asm(IMAGE_WRITE, Color, Image, Coord); \
|
||||
} while (0)
|
||||
#define READ_IMAGEUI_2DARRAY(dest, tensor, coord) \
|
||||
do { \
|
||||
int depth = get_image_array_size(tensor); \
|
||||
int4 coord_in = coord; \
|
||||
_viv_asm(CLAMP0MAX, coord_in.z, coord_in.z, depth - 1); \
|
||||
dest = read_imageui(tensor, coord_in); \
|
||||
} while(0)
|
||||
|
|
|
|||
|
|
@ -1,12 +1,12 @@
|
|||
|
||||
float4 eltwise_unary_sin(float4 x, float alpha)
|
||||
float eltwise_unary_sin(float x, float alpha)
|
||||
{
|
||||
return native_sin(x);
|
||||
}
|
||||
|
||||
#define logE (1.44269502f)
|
||||
#define twoLogE (logE * 2.0f)
|
||||
float4 eltwise_unary_exp(float4 x, float alpha)
|
||||
float eltwise_unary_exp(float x, float alpha)
|
||||
{
|
||||
x *= logE;
|
||||
x = exp2(x);
|
||||
|
|
@ -14,33 +14,33 @@ float4 eltwise_unary_exp(float4 x, float alpha)
|
|||
}
|
||||
|
||||
#define rlogE (0.693147182f)
|
||||
float4 eltwise_unary_log(float4 x, float alpha)
|
||||
float eltwise_unary_log(float x, float alpha)
|
||||
{
|
||||
x = log2(x);
|
||||
return x * rlogE;
|
||||
}
|
||||
|
||||
float4 eltwise_unary_elu(float4 val, float alpha)
|
||||
float eltwise_unary_elu(float val, float alpha)
|
||||
{
|
||||
float4 x = val * logE;
|
||||
float x = val * logE;
|
||||
x = exp2(x) * alpha - alpha;
|
||||
|
||||
return val < 0 ? x : val;
|
||||
}
|
||||
|
||||
float4 eltwise_unary_neg(float4 x, float alpha)
|
||||
float eltwise_unary_neg(float x, float alpha)
|
||||
{
|
||||
return x * -1;
|
||||
}
|
||||
|
||||
float4 eltwise_unary_hard_sigmoid(float4 x, float alpha)
|
||||
float eltwise_unary_hard_sigmoid(float x, float alpha)
|
||||
{
|
||||
x = 0.2 * x + 0.5;
|
||||
x = clamp(x, 0, 1);
|
||||
return x;
|
||||
}
|
||||
|
||||
float4 _softrelu(float4 x, float alpha)
|
||||
float _softrelu(float x, float alpha)
|
||||
{
|
||||
x *= logE;
|
||||
x = exp2(x);
|
||||
|
|
@ -49,7 +49,7 @@ float4 _softrelu(float4 x, float alpha)
|
|||
return x * rlogE;
|
||||
}
|
||||
|
||||
float4 _tanh(float4 x, float alpha)
|
||||
float _tanh(float x, float alpha)
|
||||
{
|
||||
x *= -twoLogE;
|
||||
x = 1 + exp2(x);
|
||||
|
|
@ -57,16 +57,60 @@ float4 _tanh(float4 x, float alpha)
|
|||
return (2 * x - 1);
|
||||
}
|
||||
|
||||
float4 eltwise_unary_mish(float4 x, float alpha)
|
||||
float eltwise_unary_mish(float x, float alpha)
|
||||
{
|
||||
float4 y = _softrelu(x, alpha);
|
||||
float y = _softrelu(x, alpha);
|
||||
x = x * _tanh(y, alpha);
|
||||
return x;
|
||||
}
|
||||
|
||||
float4 eltwise_unary_round(float4 x, float alpha)
|
||||
float eltwise_unary_round(float x, float alpha)
|
||||
{
|
||||
return convert_float4(convert_int4_rte(x));
|
||||
return convert_float(convert_int_rte(x));
|
||||
}
|
||||
|
||||
#define MUL2_RSQRTPI (1.1283791670955126f)
|
||||
float erf_eval(float x)
|
||||
{
|
||||
float res = 0;
|
||||
float tmp = x;
|
||||
float factorial = 1;
|
||||
float x_pow = x;
|
||||
float one = 1.0f;
|
||||
float n = 1;
|
||||
|
||||
if (x <= -3)
|
||||
return -1;
|
||||
else if (x >= 3)
|
||||
return 1;
|
||||
|
||||
while (fabs(tmp) > 1e-5)
|
||||
{
|
||||
res += tmp;
|
||||
|
||||
factorial *= n;
|
||||
one *= -1;
|
||||
x_pow *= x * x;
|
||||
tmp = one / factorial * x_pow / ( 2 * n + 1);
|
||||
|
||||
n += 1.0f;
|
||||
}
|
||||
return res * MUL2_RSQRTPI;
|
||||
}
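erf_eval above sums the Maclaurin series erf(x) = (2/sqrt(pi)) * sum_n (-1)^n x^(2n+1) / (n! (2n+1)), stopping once the next term drops below 1e-5 and saturating to +/-1 outside [-3, 3]. A self-contained C check of the same series against libm's erf() (the tolerance and cut-off mirror the kernel; the test harness itself is just scaffolding):

#include <math.h>
#include <stdio.h>

static float erf_series(float x)
{
    float res = 0.0f, term = x, factorial = 1.0f, x_pow = x, sign = 1.0f, n = 1.0f;
    if (x <= -3.0f) return -1.0f;
    if (x >=  3.0f) return  1.0f;
    while (fabsf(term) > 1e-5f)
    {
        res += term;
        factorial *= n;                 /* n!            */
        sign *= -1.0f;                  /* (-1)^n        */
        x_pow *= x * x;                 /* x^(2n+1)      */
        term = sign / factorial * x_pow / (2.0f * n + 1.0f);
        n += 1.0f;
    }
    return res * 1.1283791670955126f;   /* 2 / sqrt(pi)  */
}

int main(void)
{
    float x;
    for (x = -2.0f; x <= 2.0f; x += 0.5f)
        printf("x=% .1f  series=% .6f  libm=% .6f\n", x, erf_series(x), erf(x));
    return 0;
}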
|
||||
#define RSQRT2 (0.70710678118654752440084436210485f)
|
||||
float eltwise_unary_gelu(float x, float alpha)
|
||||
{
|
||||
x = 0.5f * x * (1 + erf_eval(x * RSQRT2));
|
||||
|
||||
return x;
|
||||
}
|
||||
|
||||
#define SQRT_2_RCP_PI 0.7978845834732056f
|
||||
float eltwise_unary_hard_gelu(float x, float alpha)
|
||||
{
|
||||
float cdf = 0.5f + 0.5f * _tanh(SQRT_2_RCP_PI *
|
||||
(x + 0.044715f * x * x * x), 0);
|
||||
return x * cdf;
|
||||
}
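eltwise_unary_gelu implements the exact form 0.5*x*(1 + erf(x/sqrt(2))), while eltwise_unary_hard_gelu uses the common tanh approximation 0.5*x*(1 + tanh(sqrt(2/pi)*(x + 0.044715*x^3))). A small C comparison of the two, using libm rather than the kernel's series and exp2-based helpers:

#include <math.h>
#include <stdio.h>

static float gelu_exact(float x)
{
    return 0.5f * x * (1.0f + erff(x * 0.70710678f));            /* x / sqrt(2)  */
}

static float gelu_tanh(float x)
{
    float t = 0.7978845834732056f * (x + 0.044715f * x * x * x); /* sqrt(2/pi)   */
    return x * (0.5f + 0.5f * tanhf(t));
}

int main(void)
{
    float x;
    for (x = -3.0f; x <= 3.0f; x += 1.0f)
        printf("x=% .1f  exact=% .6f  tanh=% .6f\n", x, gelu_exact(x), gelu_tanh(x));
    return 0;
}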
|
||||
|
||||
#define ELTWISE_UNARY_F32(func_name) \
|
||||
|
|
@ -85,9 +129,10 @@ __kernel void func_name##_F32toF32 \
|
|||
\
|
||||
float4 src = read_imagef(input, coord); \
|
||||
\
|
||||
float4 dst = eltwise_unary_##func_name(src, alpha); \
|
||||
float4 dst = 0; \
|
||||
dst.x = eltwise_unary_##func_name(src.x, alpha); \
|
||||
\
|
||||
write_imagef(output, coord, dst); \
|
||||
write_imagef(output, coord, dst.xxxx); \
|
||||
}
|
||||
ELTWISE_UNARY_F32(sin)
|
||||
ELTWISE_UNARY_F32(exp)
|
||||
|
|
@ -97,6 +142,8 @@ ELTWISE_UNARY_F32(neg)
|
|||
ELTWISE_UNARY_F32(mish)
|
||||
ELTWISE_UNARY_F32(hard_sigmoid)
|
||||
ELTWISE_UNARY_F32(round)
|
||||
ELTWISE_UNARY_F32(gelu)
|
||||
ELTWISE_UNARY_F32(hard_gelu)
|
||||
|
||||
#define ELTWISE_UNARY_F32_2D(func_name) \
|
||||
__kernel void func_name##_F32toF32_2D \
|
||||
|
|
@ -114,9 +161,10 @@ __kernel void func_name##_F32toF32_2D \
|
|||
\
|
||||
float4 src = read_imagef(input, coord); \
|
||||
\
|
||||
float4 dst = eltwise_unary_##func_name(src, alpha); \
|
||||
float4 dst = 0; \
|
||||
dst.x = eltwise_unary_##func_name(src.x, alpha); \
|
||||
\
|
||||
write_imagef(output, coord, dst); \
|
||||
write_imagef(output, coord, dst.xxxx); \
|
||||
}
|
||||
ELTWISE_UNARY_F32_2D(sin)
|
||||
ELTWISE_UNARY_F32_2D(exp)
|
||||
|
|
@ -126,6 +174,8 @@ ELTWISE_UNARY_F32_2D(neg)
|
|||
ELTWISE_UNARY_F32_2D(mish)
|
||||
ELTWISE_UNARY_F32_2D(hard_sigmoid)
|
||||
ELTWISE_UNARY_F32_2D(round)
|
||||
ELTWISE_UNARY_F32_2D(gelu)
|
||||
ELTWISE_UNARY_F32_2D(hard_gelu)
|
||||
|
||||
#define ELTWISE_UNARY_U8(func_name) \
|
||||
__kernel void func_name##_U8toU8 \
|
||||
|
|
@ -144,7 +194,7 @@ __kernel void func_name##_U8toU8 \
|
|||
uint4 src = read_imageui(input, coord); \
|
||||
float4 data = convert_float4(src) * inputScale - inputTail; \
|
||||
\
|
||||
data = eltwise_unary_##func_name(data, alpha); \
|
||||
data.x = eltwise_unary_##func_name(data.x, alpha); \
|
||||
uint4 dst = convert_uint4(data * outputScale + outputZP); \
|
||||
\
|
||||
write_imageui(output, coord, dst); \
|
||||
|
|
@ -157,6 +207,8 @@ ELTWISE_UNARY_U8(neg)
|
|||
ELTWISE_UNARY_U8(mish)
|
||||
ELTWISE_UNARY_U8(hard_sigmoid)
|
||||
ELTWISE_UNARY_U8(round)
|
||||
ELTWISE_UNARY_U8(gelu)
|
||||
ELTWISE_UNARY_U8(hard_gelu)
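The U8 variants wrap every unary op in the same dequantize/requantize pattern: x = q*inputScale - inputTail before the op, q' = y*outputScale + outputZP after it. A hedged host-side sketch of that round trip (the scale/tail/zero-point values below are invented for illustration, and the explicit clamp-and-round replaces the kernel's convert_uint4 only to keep the C version well defined):

#include <math.h>
#include <stdio.h>

static unsigned char quantized_unary(unsigned char q,
                                     float inputScale, float inputTail,
                                     float outputScale, float outputZP,
                                     float (*op)(float))
{
    float x = (float)q * inputScale - inputTail;   /* dequantize                 */
    float y = op(x);                               /* e.g. gelu, mish, round ... */
    float r = y * outputScale + outputZP;          /* requantize                 */
    if (r < 0.0f) r = 0.0f;
    if (r > 255.0f) r = 255.0f;
    return (unsigned char)(r + 0.5f);
}

static float op_round(float x) { return rintf(x); }

int main(void)
{
    /* Example: input scale 0.1 / tail 12.8, output scale 10 / zero point 128. */
    unsigned char out = quantized_unary(200, 0.1f, 12.8f, 10.0f, 128.0f, op_round);
    printf("%u\n", out);   /* 200*0.1 - 12.8 = 7.2 -> 7 -> 7*10 + 128 = 198 */
    return 0;
}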
|
||||
|
||||
#define ELTWISE_UNARY_U8_2D(func_name) \
|
||||
__kernel void func_name##_U8toU8_2D \
|
||||
|
|
@ -175,7 +227,7 @@ __kernel void func_name##_U8toU8_2D \
|
|||
uint4 src = read_imageui(input, coord); \
|
||||
float4 data = convert_float4(src) * inputScale - inputTail; \
|
||||
\
|
||||
data = eltwise_unary_##func_name(data, alpha); \
|
||||
data.x = eltwise_unary_##func_name(data.x, alpha); \
|
||||
uint4 dst = convert_uint4(data * outputScale + outputZP); \
|
||||
\
|
||||
write_imageui(output, coord, dst); \
|
||||
|
|
@ -188,6 +240,8 @@ ELTWISE_UNARY_U8_2D(neg)
|
|||
ELTWISE_UNARY_U8_2D(mish)
|
||||
ELTWISE_UNARY_U8_2D(hard_sigmoid)
|
||||
ELTWISE_UNARY_U8_2D(round)
|
||||
ELTWISE_UNARY_U8_2D(gelu)
|
||||
ELTWISE_UNARY_U8_2D(hard_gelu)
|
||||
|
||||
__kernel void neg_I32toI32
|
||||
(
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
#define MUL2_RSQRTPI (1.1283791670955126f)
|
||||
float eltwise_unary_erf(float x)
|
||||
float eltwise_unary_erf(float _x)
|
||||
{
|
||||
float x = clamp(_x, -2, 2);
|
||||
float res = 0;
|
||||
float tmp = x;
|
||||
float factorial = 1;
|
||||
|
|
|
|||
|
|
@ -6,8 +6,8 @@ __kernel void floordiv_F32F32toF32(
|
|||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
|
||||
float4 src0;
|
||||
float4 src1;
|
||||
readImage2DArray(src0, input, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEF_2DARRAY(src0, input, coord);
|
||||
READ_IMAGEF_2DARRAY(src1, input1, coord);
|
||||
float4 dst = floor(src0 / src1);
|
||||
write_imagef(output, coord, dst);
|
||||
}
|
||||
|
|
@ -32,8 +32,8 @@ __kernel void floordiv_I32I32toI32(
|
|||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
|
||||
int4 src0;
|
||||
int4 src1;
|
||||
readImage2DArray(src0, input, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEI_2DARRAY(src0, input, coord);
|
||||
READ_IMAGEI_2DARRAY(src1, input1, coord);
|
||||
int4 dst = convert_int4(floor(convert_float4(src0) / convert_float4(src1)));
|
||||
write_imagei(output, coord, dst);
|
||||
}
|
||||
|
|
@ -64,8 +64,8 @@ __kernel void floordiv_I32I32toU8(
|
|||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
|
||||
int4 src0;
|
||||
int4 src1;
|
||||
readImage2DArray(src0, input, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEI_2DARRAY(src0, input, coord);
|
||||
READ_IMAGEI_2DARRAY(src1, input1, coord);
|
||||
uint4 dst = convert_uint4(floor(convert_float4(src0) / convert_float4(src1)) * outputScale + outputTail);
|
||||
write_imageui(output, coord, dst);
|
||||
}
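All floordiv variants compute floor(a / b) in floating point rather than using C/OpenCL integer division, which truncates toward zero; the two disagree for negative operands. A short C check of the difference:

#include <math.h>
#include <stdio.h>

static int floordiv_i32(int a, int b)
{
    return (int)floorf((float)a / (float)b);
}

int main(void)
{
    printf("trunc: %d  floor: %d\n", -7 / 2, floordiv_i32(-7, 2));   /* -3 vs -4 */
    return 0;
}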
|
||||
|
|
@ -102,8 +102,8 @@ __kernel void floordiv_U8U8toU8(
|
|||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
|
||||
uint4 src0, src1;
|
||||
float4 in0, in1, out;
|
||||
readImage2DArray(src0, input, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEUI_2DARRAY(src0, input, coord);
|
||||
READ_IMAGEUI_2DARRAY(src1, input1, coord);
|
||||
in0 = convert_float4(src0) * input0Scale + input0Tail;
|
||||
in1 = convert_float4(src1) * input1Scale + input1Tail;
|
||||
out = floor(in0 / in1) * outputScale + outputTail;
|
||||
|
|
@ -148,8 +148,8 @@ __kernel void floordiv_U8I32toU8(
|
|||
uint4 src0;
|
||||
int4 src1;
|
||||
float4 in0, in1, out;
|
||||
readImage2DArray(src0, input, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEUI_2DARRAY(src0, input, coord);
|
||||
READ_IMAGEI_2DARRAY(src1, input1, coord);
|
||||
in0 = convert_float4(src0) * input0Scale + input0Tail;
|
||||
in1 = convert_float4(src1);
|
||||
out = floor(in0 / in1) * outputScale + outputTail;
|
||||
|
|
|
|||
|
|
@ -7,8 +7,8 @@ __kernel void logical_##name##_I8toI8( \
|
|||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
|
||||
int4 src0; \
|
||||
int4 src1; \
|
||||
readImage2DArray(src0, input, coord); \
|
||||
readImage2DArray(src1, input1, coord); \
|
||||
READ_IMAGEI_2DARRAY(src0, input, coord); \
|
||||
READ_IMAGEI_2DARRAY(src1, input1, coord); \
|
||||
int4 dst = (lgc_op2(src0))lgc_op(lgc_op2(src1)); \
|
||||
dst.x = dst.x & 1; \
|
||||
write_imagei(output, coord, dst); \
|
||||
|
|
|
|||
|
|
@ -15,8 +15,8 @@ __kernel void maximum_FP32FP32toFP32
|
|||
|
||||
float4 src0;
|
||||
float4 src1;
|
||||
readImage2DArray(src0, input0, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEF_2DARRAY(src0, input0, coord);
|
||||
READ_IMAGEF_2DARRAY(src1, input1, coord);
|
||||
|
||||
float4 dst = src0 > src1 ? src0 : src1;
|
||||
|
||||
|
|
@ -63,8 +63,8 @@ __kernel void maximum_U8U8toU8
|
|||
|
||||
uint4 src0;
|
||||
uint4 src1;
|
||||
readImage2DArray(src0, input0, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEUI_2DARRAY(src0, input0, coord);
|
||||
READ_IMAGEUI_2DARRAY(src1, input1, coord);
|
||||
|
||||
float4 data0 = convert_float4(src0) * input0Scale - input0Tail;
|
||||
float4 data1 = convert_float4(src1) * input1Scale - input1Tail;
|
||||
|
|
@ -118,8 +118,8 @@ __kernel void maximum_I32I32toI32
|
|||
|
||||
int4 src0;
|
||||
int4 src1;
|
||||
readImage2DArray(src0, input0, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEI_2DARRAY(src0, input0, coord);
|
||||
READ_IMAGEI_2DARRAY(src1, input1, coord);
|
||||
|
||||
int4 dst = src0 > src1 ? src0 : src1;
|
||||
|
||||
|
|
|
|||
|
|
@ -15,8 +15,8 @@ __kernel void minimum_FP32FP32toFP32
|
|||
|
||||
float4 src0;
|
||||
float4 src1;
|
||||
readImage2DArray(src0, input0, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEF_2DARRAY(src0, input0, coord);
|
||||
READ_IMAGEF_2DARRAY(src1, input1, coord);
|
||||
|
||||
float4 dst = src0 < src1 ? src0 : src1;
|
||||
|
||||
|
|
@ -63,8 +63,8 @@ __kernel void minimum_U8U8toU8
|
|||
|
||||
uint4 src0;
|
||||
uint4 src1;
|
||||
readImage2DArray(src0, input0, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEUI_2DARRAY(src0, input0, coord);
|
||||
READ_IMAGEUI_2DARRAY(src1, input1, coord);
|
||||
|
||||
float4 data0 = convert_float4(src0) * input0Scale - input0Tail;
|
||||
float4 data1 = convert_float4(src1) * input1Scale - input1Tail;
|
||||
|
|
@ -118,8 +118,8 @@ __kernel void minimum_I32I32toI32
|
|||
|
||||
int4 src0;
|
||||
int4 src1;
|
||||
readImage2DArray(src0, input0, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEI_2DARRAY(src0, input0, coord);
|
||||
READ_IMAGEI_2DARRAY(src1, input1, coord);
|
||||
|
||||
int4 dst = src0 < src1 ? src0 : src1;
|
||||
|
||||
|
|
|
|||
|
|
@ -9,8 +9,8 @@ __kernel void pow_FP32FP32toFP32
|
|||
|
||||
float4 src0, src1;
|
||||
float4 dst;
|
||||
readImage2DArray(src0, input0, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEF_2DARRAY(src0, input0, coord);
|
||||
READ_IMAGEF_2DARRAY(src1, input1, coord);
|
||||
|
||||
float4 s0 = sign(src0);
|
||||
int4 t0 = convert_int4(src1) & 1;
|
||||
|
|
|
|||
|
|
@ -15,8 +15,8 @@ __kernel void prelu_FP32FP32toFP32
|
|||
|
||||
float4 src0;
|
||||
float4 src1;
|
||||
readImage2DArray(src0, input0, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEF_2DARRAY(src0, input0, coord);
|
||||
READ_IMAGEF_2DARRAY(src1, input1, coord);
|
||||
|
||||
float4 maxData = src0 >= 0 ? src0 : 0;
|
||||
float4 minData = src0 < 0 ? src0 : 0;
|
||||
|
|
@ -67,8 +67,8 @@ __kernel void prelu_U8U8toU8
|
|||
|
||||
uint4 src0;
|
||||
uint4 src1;
|
||||
readImage2DArray(src0, input0, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEUI_2DARRAY(src0, input0, coord);
|
||||
READ_IMAGEUI_2DARRAY(src1, input1, coord);
|
||||
|
||||
float4 data0 = convert_float4(src0) * input0Scale - input0Tail;
|
||||
float4 data1 = convert_float4(src1) * input1Scale - input1Tail;
|
||||
|
|
@ -130,8 +130,8 @@ __kernel void prelu_I32I32toI32
|
|||
|
||||
int4 src0;
|
||||
int4 src1;
|
||||
readImage2DArray(src0, input0, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEI_2DARRAY(src0, input0, coord);
|
||||
READ_IMAGEI_2DARRAY(src1, input1, coord);
|
||||
|
||||
float4 data0 = convert_float4(src0) * input0Scale - input0Tail;
|
||||
float4 data1 = convert_float4(src1) * input1Scale - input1Tail;
|
||||
|
|
|
|||
|
|
@ -1,14 +1,5 @@
|
|||
#pragma OPENCL EXTENSION CL_VIV_asm : enable
|
||||
|
||||
inline uchar* get_image2D_array_ptr(image2d_array_t input)
|
||||
{
|
||||
int8 desc;
|
||||
_viv_asm(COPY, desc, input, sizeof(desc));
|
||||
uchar *src_ptr = (uchar*)desc.s0;
|
||||
|
||||
return src_ptr;
|
||||
}
|
||||
|
||||
uint4 _philox4x32bumpkey(uint4 key)
|
||||
{
|
||||
uint4 mask = (uint4)((uint)0x9E3779B9, (uint)0xBB67AE85, 0, 0);
|
||||
|
|
@ -61,14 +52,16 @@ __kernel void random_seed(
|
|||
float re_rand_max
|
||||
)
|
||||
{
|
||||
__global uint* seeds_ptr = (__global uint*)get_image2D_array_ptr(seeds);
|
||||
Tensor s_tensor = create_tensor_from_image2d_array(seeds, 4);
|
||||
__global uint* seeds_ptr = (__global uint*)s_tensor.ptr;
|
||||
seeds_ptr = seeds_ptr;
|
||||
uint4 key = vload4(0, seeds_ptr);
|
||||
|
||||
uint4 ctr = (uint4)(0);
|
||||
float4 result = 0;
|
||||
|
||||
__global float* output_ptr = (__global float*)get_image2D_array_ptr(output);
|
||||
Tensor o_tensor = create_tensor_from_image2d_array(output, 4);
|
||||
__global float* output_ptr = (__global float*)o_tensor.ptr;
|
||||
|
||||
for(int i = 0; i < iter; i++)
|
||||
{
|
||||
|
|
@ -152,17 +145,20 @@ __kernel void random_multinomial
|
|||
int class_size = get_image_width(cdfs);
|
||||
|
||||
int offset = gidy * class_size;
|
||||
__global float* cdf_ptr = (__global float*)get_image2D_array_ptr(cdfs);
|
||||
Tensor cdf_tensor = create_tensor_from_image2d_array(cdfs, 4);
|
||||
__global float* cdf_ptr = (__global float*)cdf_tensor.ptr;
|
||||
__global float* cdfPtr = cdf_ptr + offset;
|
||||
|
||||
int width = get_image_width(randoms);
|
||||
offset = coord.x + coord.y * width;
|
||||
__global float* randoms_ptr = (__global float*)get_image2D_array_ptr(randoms);
|
||||
Tensor r_tensor = create_tensor_from_image2d_array(randoms, 4);
|
||||
__global float* randoms_ptr = (__global float*)r_tensor.ptr;
|
||||
randoms_ptr = randoms_ptr + offset;
|
||||
|
||||
width = get_image_width(output);
|
||||
offset = coord.x + coord.y * width;
|
||||
__global uint* output_ptr = (__global uint*)get_image2D_array_ptr(output);
|
||||
Tensor o_tensor = create_tensor_from_image2d_array(output, 4);
|
||||
__global uint* output_ptr = (__global uint*)o_tensor.ptr;
|
||||
output_ptr = output_ptr + offset;
|
||||
|
||||
float4 ran = vload4(0, randoms_ptr);
|
||||
|
|
|
|||
|
|
@ -15,8 +15,8 @@ __kernel void func_name##_F32F32toBOOL8 \
|
|||
\
|
||||
float4 src0; \
|
||||
float4 src1; \
|
||||
readImage2DArray(src0, input0, coord); \
|
||||
readImage2DArray(src1, input1, coord); \
|
||||
READ_IMAGEF_2DARRAY(src0, input0, coord); \
|
||||
READ_IMAGEF_2DARRAY(src1, input1, coord); \
|
||||
\
|
||||
int4 dst = (src0)comp_op(src1); \
|
||||
dst &= 1; \
|
||||
|
|
@ -75,8 +75,8 @@ __kernel void func_name##_U32U32toBOOL8 \
|
|||
\
|
||||
uint4 data0; \
|
||||
uint4 data1; \
|
||||
readImage2DArray(data0, input0, coord); \
|
||||
readImage2DArray(data1, input1, coord); \
|
||||
READ_IMAGEUI_2DARRAY(data0, input0, coord); \
|
||||
READ_IMAGEUI_2DARRAY(data1, input1, coord); \
|
||||
\
|
||||
float4 src0 = convert_float4(data0) * input0Scale - input0Tail; \
|
||||
float4 src1 = convert_float4(data1) * input1Scale - input1Tail; \
|
||||
|
|
@ -139,8 +139,8 @@ __kernel void func_name##_I32I32toBOOL8 \
|
|||
\
|
||||
int4 src0; \
|
||||
int4 src1; \
|
||||
readImage2DArray(src0, input0, coord); \
|
||||
readImage2DArray(src1, input1, coord); \
|
||||
READ_IMAGEI_2DARRAY(src0, input0, coord); \
|
||||
READ_IMAGEI_2DARRAY(src1, input1, coord); \
|
||||
\
|
||||
int4 dst = (src0)comp_op(src1); \
|
||||
dst &= 1; \
|
||||
|
|
|
|||
|
|
@ -0,0 +1,45 @@
|
|||
|
||||
#define SCATTER_ND_UPDATE(src0_type, data_type, read_func, write_func) \
|
||||
__kernel void scatter_nd_update_##src0_type##src0_type##to##src0_type( \
|
||||
__read_only image2d_t input0, \
|
||||
__read_only image2d_t input1, \
|
||||
__read_only image2d_t input2, \
|
||||
__write_only image2d_t output, \
|
||||
int offsetX, \
|
||||
int offsetY, \
|
||||
int offsetZ, \
|
||||
int offsetW, \
|
||||
int offset_idx, \
|
||||
int coord_dim, \
|
||||
int index_num \
|
||||
) \
|
||||
{ \
|
||||
int gidx = get_global_id(0); \
|
||||
int gidy = get_global_id(1); \
|
||||
int cnt = 0; \
|
||||
\
|
||||
data_type sum = (data_type)(0, 0, 0, 0); \
|
||||
Image img1 = create_image_from_image2d(input1, 4); \
|
||||
__global int* index_ptr = (__global int*)img1.ptr; \
|
||||
for(int i = 0; i < index_num; i++) \
|
||||
{ \
|
||||
int4 indice = vload4(0, index_ptr + offset_idx); \
|
||||
index_ptr += coord_dim; \
|
||||
int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW; \
|
||||
if(gidy == idx) \
|
||||
{ \
|
||||
data_type data = read_func(input2, (int2)(gidx, i)); \
|
||||
cnt++; \
|
||||
sum += data; \
|
||||
} \
|
||||
} \
|
||||
int2 coord = (int2)(gidx, gidy); \
|
||||
if(cnt == 0) \
|
||||
{ \
|
||||
sum = read_func(input0, coord); \
|
||||
} \
|
||||
write_func(output, coord, sum); \
|
||||
}
|
||||
SCATTER_ND_UPDATE(U32, uint4, read_imageui, write_imageui)
|
||||
SCATTER_ND_UPDATE(I32, int4, read_imagei, write_imagei)
|
||||
SCATTER_ND_UPDATE(F32, float4, read_imagef, write_imagef)
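In words, scatter_nd_update_* rebuilds each output element: rows whose flattened index matches an entry of input1 receive the sum of the corresponding rows of input2 (duplicated indices accumulate), and unmatched rows are copied through from input0. A host-side C sketch of the same semantics for 1-D indices over the outer dimension (array layout and names are illustrative assumptions):

#include <stdio.h>

static void scatter_nd_update_rows(const float *input, const int *indices,
                                   const float *updates, float *output,
                                   int rows, int cols, int index_num)
{
    int r, c, i;
    for (r = 0; r < rows; r++)
    {
        for (c = 0; c < cols; c++)
        {
            float sum = 0.0f;
            int cnt = 0;
            for (i = 0; i < index_num; i++)
            {
                if (indices[i] == r)          /* same test as gidy == idx     */
                {
                    sum += updates[i * cols + c];
                    cnt++;                     /* duplicates accumulate        */
                }
            }
            output[r * cols + c] = (cnt == 0) ? input[r * cols + c] : sum;
        }
    }
}

int main(void)
{
    float in[3 * 2]  = {1, 1,  2, 2,  3, 3};
    int   idx[2]     = {0, 2};
    float upd[2 * 2] = {9, 9,  8, 8};
    float out[3 * 2];
    int r;
    scatter_nd_update_rows(in, idx, upd, out, 3, 2, 2);
    for (r = 0; r < 3; r++) printf("%g %g\n", out[r * 2], out[r * 2 + 1]);
    /* prints: 9 9 / 2 2 / 8 8 */
    return 0;
}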
|
||||
|
|
@ -12,9 +12,9 @@ __kernel void select_I8_U8_U8toU8(
|
|||
int4 value;
|
||||
uint4 src0, src1, src, dst;
|
||||
float inputScale, inputTail;
|
||||
readImage2DArray(value, condition, coord);
|
||||
readImage2DArray(src0, input0, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEI_2DARRAY(value, condition, coord);
|
||||
READ_IMAGEF_2DARRAY(src0, input0, coord);
|
||||
READ_IMAGEF_2DARRAY(src1, input1, coord);
|
||||
src = (value != 0 ? src0 : src1);
|
||||
inputScale = (value.x != 0 ? input0Scale : input1Scale);
|
||||
inputTail = (value.x != 0 ? input0Tail : input1Tail);
|
||||
|
|
@ -56,9 +56,9 @@ __kernel void select_I8_I32_I32toI32(
|
|||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
|
||||
int4 value;
|
||||
int4 src0, src1, dst;
|
||||
readImage2DArray(value, condition, coord);
|
||||
readImage2DArray(src0, input0, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEI_2DARRAY(value, condition, coord);
|
||||
READ_IMAGEI_2DARRAY(src0, input0, coord);
|
||||
READ_IMAGEI_2DARRAY(src1, input1, coord);
|
||||
dst = (value != 0 ? src0 : src1);
|
||||
write_imagei(output, coord, dst);
|
||||
}
|
||||
|
|
@ -94,9 +94,9 @@ __kernel void select_I8_F32_F32toF32(
|
|||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
|
||||
int4 value;
|
||||
float4 src0, src1, dst;
|
||||
readImage2DArray(value, condition, coord);
|
||||
readImage2DArray(src0, input0, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEI_2DARRAY(value, condition, coord);
|
||||
READ_IMAGEF_2DARRAY(src0, input0, coord);
|
||||
READ_IMAGEF_2DARRAY(src1, input1, coord);
|
||||
dst = (value != 0 ? src0 : src1);
|
||||
write_imagef(output, coord, dst);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,21 @@
|
|||
|
||||
#define SIGNAL_FRAME_SH_IMPL(type, data_type, read_imagefunc, write_imagefunc) \
|
||||
__kernel void signal_frame_##type##to##type \
|
||||
( \
|
||||
__read_only image2d_t input, \
|
||||
__write_only image2d_array_t output, \
|
||||
int frame_step \
|
||||
) \
|
||||
{ \
|
||||
int inner = get_global_id(0); \
|
||||
int length_k = get_global_id(1); \
|
||||
int frames_id = get_global_id(2); \
|
||||
\
|
||||
int4 coord = (int4)(inner, length_k, frames_id, frames_id); \
|
||||
int2 coord_in = (int2)(inner, frames_id * frame_step + length_k); \
|
||||
\
|
||||
data_type src = read_imagefunc(input, coord_in); \
|
||||
write_imagefunc(output, coord, src); \
|
||||
}
|
||||
SIGNAL_FRAME_SH_IMPL(F32, float4, read_imagef, write_imagef)
|
||||
SIGNAL_FRAME_SH_IMPL(U8, uint4, read_imageui, write_imageui)
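signal_frame_* simply gathers output[frame][k] = input[frame * frame_step + k], so consecutive frames overlap whenever frame_step is smaller than the frame length. A plain C sketch with illustrative sizes:

#include <stdio.h>

static void signal_frame(const float *input, float *output,
                         int frame_length, int frame_step, int num_frames)
{
    int f, k;
    for (f = 0; f < num_frames; f++)
        for (k = 0; k < frame_length; k++)
            output[f * frame_length + k] = input[f * frame_step + k];
}

int main(void)
{
    float x[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    float frames[3 * 4];
    int f;
    signal_frame(x, frames, 4, 2, 3);   /* 3 frames of length 4, hop 2 */
    for (f = 0; f < 3; f++)
        printf("%g %g %g %g\n", frames[f * 4], frames[f * 4 + 1],
               frames[f * 4 + 2], frames[f * 4 + 3]);
    return 0;
}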
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
#define TILE_3D(name0, name1, data_type, write_image_func) \
|
||||
#define TILE_3D(name0, name1, data_type, read_image_func, write_image_func) \
|
||||
__kernel void tile_##name0##to##name1 \
|
||||
( \
|
||||
__read_only image2d_array_t input, \
|
||||
|
|
@ -19,7 +19,7 @@ __kernel void tile_##name0##to##name1 \
|
|||
int height = get_image_height(input); \
|
||||
\
|
||||
data_type src; \
|
||||
readImage2DArray(src, input, coord); \
|
||||
read_image_func(src, input, coord); \
|
||||
\
|
||||
int batch_id = (short)coord.z / (short)depthIn; \
|
||||
coord.z = (short)coord.z % (short)depthIn; \
|
||||
|
|
@ -46,11 +46,11 @@ __kernel void tile_##name0##to##name1 \
|
|||
} \
|
||||
} \
|
||||
}
|
||||
TILE_3D(I32, I32, int4, write_imagei)
|
||||
TILE_3D(U32, U32, uint4, write_imageui)
|
||||
TILE_3D(F32, F32, float4, write_imagef)
|
||||
TILE_3D(I32, I32, int4, READ_IMAGEI_2DARRAY, write_imagei)
|
||||
TILE_3D(U32, U32, uint4, READ_IMAGEUI_2DARRAY, write_imageui)
|
||||
TILE_3D(F32, F32, float4, READ_IMAGEF_2DARRAY, write_imagef)
|
||||
|
||||
#define TILE_2D(name0, name1, data_type) \
|
||||
#define TILE_2D(name0, name1, data_type, read_image_func, write_image_func) \
|
||||
__kernel void tile_##name0##to##name1##_2D \
|
||||
( \
|
||||
__read_only image2d_t input, \
|
||||
|
|
@ -70,23 +70,22 @@ __kernel void tile_##name0##to##name1##_2D \
|
|||
int output_width = get_image_width(output); \
|
||||
int output_height = get_image_height(output); \
|
||||
\
|
||||
data_type src; \
|
||||
readImage(src, input, coord); \
|
||||
data_type src = read_image_func(input, coord); \
|
||||
\
|
||||
do \
|
||||
{ \
|
||||
do \
|
||||
{ \
|
||||
writeImage(output, coord, src); \
|
||||
write_image_func(output, coord, src); \
|
||||
coord.x += width; \
|
||||
} while (coord.x < output_width); \
|
||||
coord.x = get_global_id(0); \
|
||||
coord.y += height; \
|
||||
} while (coord.y < output_height); \
|
||||
}
|
||||
TILE_2D(I32, I32, int4)
|
||||
TILE_2D(U32, U32, uint4)
|
||||
TILE_2D(F32, F32, float4)
|
||||
TILE_2D(I32, I32, int4, read_imagei, write_imagei)
|
||||
TILE_2D(U32, U32, uint4, read_imageui, write_imageui)
|
||||
TILE_2D(F32, F32, float4, read_imagef, write_imagef)
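The tile kernels replicate the input across the output: each source element is written to every position offset by whole multiples of the input width/height, using the nested do/while loops above. A host-side C sketch of the 2-D case:

#include <stdio.h>

static void tile_2d(const float *in, int w, int h, float *out, int ow, int oh)
{
    int x, y;
    for (y = 0; y < h; y++)
        for (x = 0; x < w; x++)
        {
            int oy = y;
            do {
                int ox = x;
                do {
                    out[oy * ow + ox] = in[y * w + x];   /* copy to this tile */
                    ox += w;
                } while (ox < ow);
                oy += h;
            } while (oy < oh);
        }
}

int main(void)
{
    float in[2 * 2] = {1, 2, 3, 4};
    float out[4 * 4];
    int y;
    tile_2d(in, 2, 2, out, 4, 4);
    for (y = 0; y < 4; y++)
        printf("%g %g %g %g\n", out[y * 4], out[y * 4 + 1],
               out[y * 4 + 2], out[y * 4 + 3]);
    return 0;
}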
@ -1,511 +0,0 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <float.h>
|
||||
#include <math.h>
|
||||
|
||||
#include "vsi_nn_platform.h"
|
||||
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_test.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "utils/vsi_nn_dtype_util.h"
|
||||
#include "utils/vsi_nn_math.h"
|
||||
#include "utils/vsi_nn_link_list.h"
|
||||
#include "libnnext/vsi_nn_vxkernel.h"
|
||||
#include "libnnext/vx_lib_nnext.h"
|
||||
|
||||
#define _VX_KERNEL_VAR (vx_kernel_BOX_WITH_NMS_LIMIT)
|
||||
#define _VX_KERNEL_ID (VX_KERNEL_ENUM_BOX_WITH_NMS_LIMIT)
|
||||
#define _VX_KERNEL_NAME (VX_KERNEL_NAME_BOX_WITH_NMS_LIMIT)
|
||||
#define _VX_KERNEL_FUNC_KERNEL (vxBox_with_nms_limitKernel)
|
||||
|
||||
static float hard_nms_kernel
|
||||
(
|
||||
float iou,
|
||||
float iouThreshold
|
||||
)
|
||||
{
|
||||
return iou < iouThreshold ? 1.0f : 0.0f;
|
||||
}
|
||||
|
||||
static float linear_nms_kernel
|
||||
(
|
||||
float iou,
|
||||
float iouThreshold
|
||||
)
|
||||
{
|
||||
return iou < iouThreshold ? 1.0f : 1.0f - iou;
|
||||
}
|
||||
|
||||
static float gaussian_nms_kernel
|
||||
(
|
||||
float iou,
|
||||
float sigma
|
||||
)
|
||||
{
|
||||
return (float)(exp(-1.0f * iou * iou / sigma));
|
||||
}
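The three kernels above are the usual NMS score-decay choices: hard NMS zeroes a score once IoU reaches the threshold, linear soft-NMS scales it by (1 - IoU), and gaussian soft-NMS by exp(-IoU^2 / sigma). A quick standalone C comparison of the weights they produce:

#include <math.h>
#include <stdio.h>

static float hard_k(float iou, float thr)    { return iou < thr ? 1.0f : 0.0f; }
static float linear_k(float iou, float thr)  { return iou < thr ? 1.0f : 1.0f - iou; }
static float gauss_k(float iou, float sigma) { return expf(-iou * iou / sigma); }

int main(void)
{
    float iou;
    for (iou = 0.0f; iou <= 0.9f; iou += 0.3f)
        printf("iou=%.1f  hard=%.2f  linear=%.2f  gaussian=%.2f\n",
               iou, hard_k(iou, 0.5f), linear_k(iou, 0.5f), gauss_k(iou, 0.5f));
    return 0;
}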
|
||||
|
||||
void swap_element
|
||||
(
|
||||
uint32_t* list,
|
||||
uint32_t first,
|
||||
uint32_t second
|
||||
)
|
||||
{
|
||||
uint32_t temp = list[first];
|
||||
list[first] = list[second];
|
||||
list[second] = temp;
|
||||
}
|
||||
|
||||
uint32_t max_element
|
||||
(
|
||||
float* data,
|
||||
uint32_t* index_list,
|
||||
uint32_t len
|
||||
)
|
||||
{
|
||||
uint32_t i;
|
||||
uint32_t max_index = 0;
|
||||
float max_val = data[index_list[0]];
|
||||
for(i = 1; i < len; i++)
|
||||
{
|
||||
float val = data[index_list[i]];
|
||||
if (max_val < val)
|
||||
{
|
||||
max_val = val;
|
||||
max_index = i;
|
||||
}
|
||||
}
|
||||
return max_index;
|
||||
}
|
||||
|
||||
static uint32_t max_comp_func
|
||||
(
|
||||
void* data,
|
||||
int32_t left,
|
||||
int32_t right
|
||||
)
|
||||
{
|
||||
float* fdata = (float*)data;
|
||||
return fdata[left] >= fdata[right];
|
||||
}
|
||||
|
||||
void sort_element_by_score
|
||||
(
|
||||
float* data,
|
||||
uint32_t* index_list,
|
||||
uint32_t len
|
||||
)
|
||||
{
|
||||
vsi_nn_partition(data, 0, len - 1, max_comp_func, TRUE, index_list);
|
||||
}
|
||||
|
||||
typedef struct
|
||||
{
|
||||
float* fdata;
|
||||
uint32_t numClasses;
|
||||
} class_comp_param;
|
||||
|
||||
static uint32_t class_comp_func
|
||||
(
|
||||
void* data,
|
||||
int32_t left,
|
||||
int32_t right
|
||||
)
|
||||
{
|
||||
class_comp_param *p = (class_comp_param*)data;
|
||||
float* fdata = p->fdata;
|
||||
uint32_t numClasses = p->numClasses;
|
||||
uint32_t lhsClass = left % numClasses, rhsClass = right % numClasses;
|
||||
return lhsClass == rhsClass ? fdata[left] > fdata[right]
|
||||
: lhsClass < rhsClass;
|
||||
}
|
||||
|
||||
static void sort_element_by_class
|
||||
(
|
||||
float* data,
|
||||
uint32_t* index_list,
|
||||
uint32_t len,
|
||||
uint32_t numClasses
|
||||
)
|
||||
{
|
||||
class_comp_param class_comp;
|
||||
class_comp.fdata = data;
|
||||
class_comp.numClasses = numClasses;
|
||||
vsi_nn_partition(&class_comp, 0, len - 1, class_comp_func, TRUE, index_list);
|
||||
}
|
||||
|
||||
// Taking two bounding boxes (each as [x1, y1, x2, y2]), return the intersection-over-union.
|
||||
float getIoUAxisAligned
|
||||
(
|
||||
const float* roi1,
|
||||
const float* roi2
|
||||
)
|
||||
{
|
||||
const float area1 = (roi1[2] - roi1[0]) * (roi1[3] - roi1[1]);
|
||||
const float area2 = (roi2[2] - roi2[0]) * (roi2[3] - roi2[1]);
|
||||
const float x1 = vsi_nn_max(roi1[0], roi2[0]);
|
||||
const float x2 = vsi_nn_min(roi1[2], roi2[2]);
|
||||
const float y1 = vsi_nn_max(roi1[1], roi2[1]);
|
||||
const float y2 = vsi_nn_min(roi1[3], roi2[3]);
|
||||
const float w = vsi_nn_max(x2 - x1, 0.0f);
|
||||
const float h = vsi_nn_max(y2 - y1, 0.0f);
|
||||
const float areaIntersect = w * h;
|
||||
const float areaUnion = area1 + area2 - areaIntersect;
|
||||
return areaIntersect / areaUnion;
|
||||
}
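Usage sketch for getIoUAxisAligned (it relies on the function defined just above; the wrapper name and values are illustrative): two unit squares overlapping by half have intersection 0.5 and union 1.5, so the IoU is 1/3.

static void iou_usage_example(void)
{
    float roi_a[4] = {0.0f, 0.0f, 1.0f, 1.0f};
    float roi_b[4] = {0.5f, 0.0f, 1.5f, 1.0f};
    float iou = getIoUAxisAligned(roi_a, roi_b);   /* 0.5f / 1.5f = 0.333... */
    (void)iou;
}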
|
||||
|
||||
static vsi_status VX_CALLBACK vxBox_with_nms_limitKernel
|
||||
(
|
||||
vx_node node,
|
||||
const vx_reference* paramObj,
|
||||
uint32_t paramNum
|
||||
)
|
||||
{
|
||||
#define ARG_NUM (5)
|
||||
#define TENSOR_NUM_INPUT (3)
|
||||
#define TENSOR_NUM_OUTPUT (4)
|
||||
#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT)
|
||||
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vx_context context = NULL;
|
||||
vx_tensor input[TENSOR_NUM_INPUT] = {0};
|
||||
vx_tensor output[TENSOR_NUM_OUTPUT] = {0};
|
||||
float *f32_in_buffer[TENSOR_NUM_INPUT] = {0};
|
||||
int32_t* int32_in_buffer[TENSOR_NUM_INPUT] = {0};
|
||||
float *f32_out_buffer[TENSOR_NUM_OUTPUT] = {0};
|
||||
int32_t* int32_out_buffer[TENSOR_NUM_OUTPUT] = {0};
|
||||
vsi_nn_tensor_attr_t in_attr[TENSOR_NUM_INPUT];
|
||||
vsi_nn_tensor_attr_t out_attr[TENSOR_NUM_OUTPUT];
|
||||
uint32_t in_elements[TENSOR_NUM_INPUT] = {0};
|
||||
uint32_t out_elements[TENSOR_NUM_OUTPUT]= {0};
|
||||
|
||||
float scoreThreshold;
|
||||
int32_t maxNumDetections;
|
||||
int32_t nms_kernel_method;
|
||||
float iou_threshold;
|
||||
float sigma;
|
||||
float nms_score_threshold;
|
||||
|
||||
uint32_t i = 0;
|
||||
for(i = 0; i < TENSOR_NUM_INPUT; i++)
|
||||
{
|
||||
memset(&in_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t));
|
||||
}
|
||||
for(i = 0; i < TENSOR_NUM_OUTPUT; i++)
|
||||
{
|
||||
memset(&out_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t));
|
||||
}
|
||||
/* prepare data */
|
||||
context = vxGetContext((vx_reference)node);
|
||||
|
||||
for(i = 0; i < TENSOR_NUM_INPUT; i ++)
|
||||
{
|
||||
input[i] = (vx_tensor)paramObj[i];
|
||||
status = vsi_nn_vxGetTensorAttr(input[i], &in_attr[i]);
|
||||
TEST_CHECK_STATUS(status, final);
|
||||
in_elements[i] = vsi_nn_vxGetTensorElementNum(&in_attr[i]);
|
||||
if (i == 2)
|
||||
{
|
||||
int32_in_buffer[i] = (int32_t *)vsi_nn_vxCopyTensorToData(context,
|
||||
input[i], &in_attr[i]);
|
||||
}
|
||||
else
|
||||
{
|
||||
f32_in_buffer[i] = (float *)malloc(in_elements[i] * sizeof(float));
|
||||
status = vsi_nn_vxConvertTensorToFloat32Data(
|
||||
context, input[i], &in_attr[i], f32_in_buffer[i],
|
||||
in_elements[i] * sizeof(float));
|
||||
TEST_CHECK_STATUS(status, final);
|
||||
}
|
||||
}
|
||||
for(i = 0; i < TENSOR_NUM_OUTPUT; i ++)
|
||||
{
|
||||
output[i] = (vx_tensor)paramObj[i + TENSOR_NUM_INPUT];
|
||||
status = vsi_nn_vxGetTensorAttr(output[i], &out_attr[i]);
|
||||
TEST_CHECK_STATUS(status, final);
|
||||
out_elements[i] = vsi_nn_vxGetTensorElementNum(&out_attr[i]);
|
||||
if (i < 2)
|
||||
{
|
||||
f32_out_buffer[i] = (float *)malloc(out_elements[i] * sizeof(float));
|
||||
memset(f32_out_buffer[i], 0, out_elements[i] * sizeof(float));
|
||||
}
|
||||
else
|
||||
{
|
||||
int32_out_buffer[i] = (int32_t *)malloc(out_elements[i] * sizeof(int32_t));
|
||||
memset(int32_out_buffer[i], 0, out_elements[i] * sizeof(int32_t));
|
||||
}
|
||||
}
|
||||
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(scoreThreshold),
|
||||
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 1], &(maxNumDetections),
|
||||
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 2], &(nms_kernel_method),
|
||||
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 3], &(iou_threshold),
|
||||
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 4], &(sigma),
|
||||
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 5], &(nms_score_threshold),
|
||||
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
/* TODO: Add CPU kernel implementation */
|
||||
{
|
||||
uint32_t j, n, b, c;
|
||||
const uint32_t kRoiDim = 4;
|
||||
uint32_t numRois = in_attr[0].size[1];
|
||||
uint32_t numClasses = in_attr[0].size[0];
|
||||
int32_t ind;
|
||||
|
||||
uint32_t * batch_data = (uint32_t*)malloc(numRois * sizeof(uint32_t));
|
||||
int32_t numBatch = 0;
|
||||
uint32_t * select = NULL;
|
||||
uint32_t select_size = 0;
|
||||
uint32_t scores_index = 0;
|
||||
uint32_t roi_index = 0;
|
||||
uint32_t roi_out_index = 0;
|
||||
|
||||
memset(batch_data, 0, numRois * sizeof(uint32_t));
|
||||
for (i = 0, ind = -1; i < numRois; i++)
|
||||
{
|
||||
if (int32_in_buffer[2][i] != ind)
|
||||
{
|
||||
ind = int32_in_buffer[2][i];
|
||||
numBatch++;
|
||||
}
|
||||
batch_data[numBatch - 1]++;
|
||||
}
|
||||
select = (uint32_t*)malloc(numBatch * numRois
|
||||
* numClasses * sizeof(uint32_t));
|
||||
memset(select, 0, numBatch * numRois * numClasses * sizeof(uint32_t));
|
||||
for (n = 0; n < (uint32_t)numBatch; n++)
|
||||
{
|
||||
int32_t numDetections_batch = 0;
|
||||
uint32_t select_start_batch = select_size;
|
||||
uint32_t select_len = 0;
|
||||
// Exclude class 0 (background)
|
||||
for (c = 1; c < numClasses; c++)
|
||||
{
|
||||
uint32_t select_start = select_size;
|
||||
int32_t maxNumDetections0 = maxNumDetections;
|
||||
uint32_t numDetections = 0;
|
||||
for (b = 0; b < batch_data[n]; b++)
|
||||
{
|
||||
uint32_t index = b * numClasses + c;
|
||||
float score = f32_in_buffer[0][scores_index + index];
|
||||
if (score > scoreThreshold) {
|
||||
select[select_size] = index;
|
||||
select_size++;
|
||||
}
|
||||
}
|
||||
select_len = select_size - select_start;
|
||||
|
||||
if (maxNumDetections0 < 0)
|
||||
{
|
||||
maxNumDetections0 = select_len;
|
||||
}
|
||||
|
||||
for (j = 0; (j < select_len && numDetections < (uint32_t)maxNumDetections0); j++)
|
||||
{
|
||||
// find max score and swap to the front.
|
||||
int32_t max_index = max_element(&(f32_in_buffer[0][scores_index]),
|
||||
&(select[select_start + j]), select_len - j) + j;
|
||||
|
||||
swap_element(&(select[select_start]), max_index, j);
|
||||
|
||||
// Calculate IoU of the rest, swap to the end (discard) if needed.
|
||||
for (i = j + 1; i < select_len; i++)
|
||||
{
|
||||
int32_t roiBase0 = roi_index + select[select_start + i] * kRoiDim;
|
||||
int32_t roiBase1 = roi_index + select[select_start + j] * kRoiDim;
|
||||
float iou = getIoUAxisAligned(&(f32_in_buffer[1][roiBase0]),
|
||||
&(f32_in_buffer[1][roiBase1]));
|
||||
float kernel_iou;
|
||||
if (nms_kernel_method == 0)
|
||||
{
|
||||
kernel_iou = hard_nms_kernel(iou, iou_threshold);
|
||||
}
|
||||
else if (nms_kernel_method == 1)
|
||||
{
|
||||
kernel_iou = linear_nms_kernel(iou, iou_threshold);
|
||||
}
|
||||
else
|
||||
{
|
||||
kernel_iou = gaussian_nms_kernel(iou, sigma);
|
||||
|
||||
}
|
||||
f32_in_buffer[0][scores_index + select[select_start + i]] *= kernel_iou;
|
||||
if (f32_in_buffer[0][scores_index + select[select_start + i]] < nms_score_threshold)
|
||||
{
|
||||
swap_element(&(select[select_start]), i, select_len - 1);
|
||||
i--;
|
||||
select_len--;
|
||||
}
|
||||
}
|
||||
numDetections++;
|
||||
}
|
||||
select_size = select_start + select_len;
|
||||
numDetections_batch += numDetections;
|
||||
}
|
||||
|
||||
// Take top maxNumDetections.
|
||||
sort_element_by_score(&(f32_in_buffer[0][scores_index]), &(select[select_start_batch]),
|
||||
numDetections_batch);
|
||||
|
||||
if (numDetections_batch > maxNumDetections)
|
||||
{
|
||||
select_size = select_start_batch + maxNumDetections;
|
||||
}
|
||||
select_len = select_size - select_start_batch;
|
||||
// Sort again by class.
|
||||
sort_element_by_class(&(f32_in_buffer[0][scores_index]), &(select[select_start_batch]),
|
||||
select_len, numClasses);
|
||||
|
||||
for (i = 0; i < select_len; i++)
|
||||
{
|
||||
int32_t in_index0 = scores_index + select[select_start_batch + i];
|
||||
int32_t in_index1 = roi_index + select[select_start_batch + i] * kRoiDim;
|
||||
f32_out_buffer[0][roi_out_index] = f32_in_buffer[0][in_index0];
|
||||
memcpy(&(f32_out_buffer[1][roi_out_index * kRoiDim]),
|
||||
&f32_in_buffer[1][in_index1], kRoiDim * sizeof(float));
|
||||
int32_out_buffer[2][roi_out_index] = select[select_start_batch + i] % numClasses;
|
||||
int32_out_buffer[3][roi_out_index] = n;
|
||||
roi_out_index++;
|
||||
}
|
||||
|
||||
scores_index += batch_data[n] * numClasses;
|
||||
roi_index += batch_data[n] * numClasses * kRoiDim;
|
||||
}
|
||||
if (batch_data) free(batch_data);
|
||||
if (select) free(select);
|
||||
}
|
||||
|
||||
/* save data */
|
||||
for(i = 0; i < TENSOR_NUM_OUTPUT; i++)
|
||||
{
|
||||
if (i < 2)
|
||||
{
|
||||
status = vsi_nn_vxConvertFloat32DataToTensor(
|
||||
context, output[i], &out_attr[i], f32_out_buffer[i],
|
||||
out_elements[i] * sizeof(float));
|
||||
TEST_CHECK_STATUS(status, final);
|
||||
}
|
||||
else
|
||||
{
|
||||
vsi_nn_vxCopyDataToTensor(context, output[i], &out_attr[i],
|
||||
(uint8_t *)int32_out_buffer[i]);
|
||||
}
|
||||
}
|
||||
|
||||
final:
|
||||
for (i = 0; i < TENSOR_NUM_INPUT; i++)
|
||||
{
|
||||
if (f32_in_buffer[i]) free(f32_in_buffer[i]);
|
||||
if (int32_in_buffer[i]) free(int32_in_buffer[i]);
|
||||
}
|
||||
for(i = 0; i < TENSOR_NUM_OUTPUT; i++)
|
||||
{
|
||||
if (f32_out_buffer[i]) free(f32_out_buffer[i]);
|
||||
if (int32_out_buffer[i]) free(int32_out_buffer[i]);
|
||||
}
|
||||
return status;
|
||||
} /* _VX_KERNEL_FUNC_KERNEL() */
|
||||
|
||||
static vx_param_description_t vxBox_with_nms_limitKernelParam[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
};
|
||||
|
||||
vx_status VX_CALLBACK vxBox_with_nms_limitInitializer
|
||||
(
|
||||
vx_node nodObj,
|
||||
const vx_reference *paramObj,
|
||||
vx_uint32 paraNum
|
||||
)
|
||||
{
|
||||
vx_status status = VX_SUCCESS;
|
||||
/*TODO: Add initial code for VX program*/
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
vx_kernel_description_t vxBox_with_nms_limit_CPU =
|
||||
{
|
||||
_VX_KERNEL_ID,
|
||||
_VX_KERNEL_NAME,
|
||||
_VX_KERNEL_FUNC_KERNEL,
|
||||
vxBox_with_nms_limitKernelParam,
|
||||
_cnt_of_array( vxBox_with_nms_limitKernelParam ),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vsi_nn_KernelInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxBox_with_nms_limit_VX =
|
||||
{
|
||||
_VX_KERNEL_ID,
|
||||
_VX_KERNEL_NAME,
|
||||
NULL,
|
||||
vxBox_with_nms_limitKernelParam,
|
||||
_cnt_of_array( vxBox_with_nms_limitKernelParam ),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxBox_with_nms_limitInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t * vx_kernel_BOX_WITH_NMS_LIMIT_list[] =
|
||||
{
|
||||
&vxBox_with_nms_limit_CPU,
|
||||
&vxBox_with_nms_limit_VX,
|
||||
NULL
|
||||
};
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
@ -1,250 +0,0 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "vsi_nn_platform.h"
|
||||
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_test.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "utils/vsi_nn_dtype_util.h"
|
||||
#include "utils/vsi_nn_math.h"
|
||||
#include "libnnext/vsi_nn_vxkernel.h"
|
||||
#include "libnnext/vx_lib_nnext.h"
|
||||
|
||||
#define _VX_KERNEL_VAR (vx_kernel_EXTRA_ENDING)
|
||||
#define _VX_KERNEL_ID (VX_KERNEL_ENUM_EXTRA_ENDING)
|
||||
#define _VX_KERNEL_FUNC_KERNEL (vxExtra_endingKernel)
|
||||
|
||||
static vsi_status VX_CALLBACK vxExtra_endingKernel
|
||||
(
|
||||
vx_node node,
|
||||
const vx_reference* paramObj,
|
||||
uint32_t paramNum
|
||||
)
|
||||
{
|
||||
#define TENSOR_NUM_INPUT (2)
|
||||
#define TENSOR_NUM_OUTPUT (1)
|
||||
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vx_context context = NULL;
|
||||
vx_tensor input = NULL;
|
||||
vx_tensor output[TENSOR_NUM_OUTPUT] = {0};
|
||||
uint8_t *u8_in_buffer[1] = {0};
|
||||
uint8_t *u8_out_buffer[TENSOR_NUM_OUTPUT] = {0};
|
||||
vsi_nn_tensor_attr_t out_attr[TENSOR_NUM_OUTPUT];
|
||||
uint32_t out_elements[TENSOR_NUM_OUTPUT]= {0};
|
||||
vsi_nn_tensor_attr_t in_attr;
|
||||
|
||||
int32_t i = 0;
|
||||
|
||||
memset(&in_attr, 0x0, sizeof(vsi_nn_tensor_attr_t));
|
||||
for(i = 0; i < TENSOR_NUM_OUTPUT; i++)
|
||||
{
|
||||
memset(&out_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t));
|
||||
}
|
||||
/* prepare data */
|
||||
context = vxGetContext((vx_reference)node);
|
||||
|
||||
input = (vx_tensor)paramObj[1];
|
||||
status = vsi_nn_vxGetTensorAttr(input, &in_attr);
|
||||
TEST_CHECK_STATUS(status, final);
|
||||
|
||||
for(i = 0; i < 1; i ++)
|
||||
{
|
||||
output[i] = (vx_tensor)paramObj[i + TENSOR_NUM_INPUT];
|
||||
status = vsi_nn_vxGetTensorAttr(output[i], &out_attr[i]);
|
||||
TEST_CHECK_STATUS(status, final);
|
||||
out_elements[i] = vsi_nn_vxGetTensorElementNum(&out_attr[i]);
|
||||
u8_out_buffer[i]= (uint8_t *)malloc(out_elements[i] * sizeof(uint8_t));
|
||||
memset(u8_out_buffer[i], 0, out_elements[i] * sizeof(uint8_t));
|
||||
|
||||
u8_in_buffer[0] = vsi_nn_vxCopyTensorToData(context, input, &in_attr);
|
||||
memcpy(u8_out_buffer[0], u8_in_buffer[0], out_elements[i] * sizeof(uint8_t));
|
||||
}
|
||||
|
||||
/* save data */
|
||||
status = vsi_nn_vxCopyDataToTensor(context, output[0], &out_attr[0], u8_out_buffer[0]);
|
||||
TEST_CHECK_STATUS(status, final);
|
||||
|
||||
final:
|
||||
for(i = 0; i < TENSOR_NUM_OUTPUT; i++)
|
||||
{
|
||||
if (u8_out_buffer[i]) free(u8_out_buffer[i]);
|
||||
}
|
||||
if (u8_in_buffer[0]) free(u8_in_buffer[0]);
|
||||
return status;
|
||||
} /* _VX_KERNEL_FUNC_KERNEL() */
|
||||
|
||||
static vx_param_description_t vxExtra_endingKernelParam[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
};
|
||||
|
||||
vx_status VX_CALLBACK vxExtra_endingInitializer
|
||||
(
|
||||
vx_node nodObj,
|
||||
const vx_reference *paramObj,
|
||||
vx_uint32 paraNum
|
||||
)
|
||||
{
|
||||
vx_status status = VX_SUCCESS;
|
||||
// Alignment with a power of two value.
|
||||
#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1)
|
||||
vx_kernel_execution_parameters_t shaderParam = {
|
||||
3, // workdim
|
||||
{0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
|
||||
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
|
||||
{0, 0, 0}, // localWorkSize: local group size in thread
|
||||
{0, 0, 0}}; // globalWorkSize: image size in thread
|
||||
|
||||
vx_tensor output = (vx_tensor)paramObj[2];
|
||||
|
||||
vx_uint32 width = 0;
|
||||
vx_uint32 height = 0;
|
||||
vx_uint32 channel = 0;
|
||||
vx_uint32 dst_size[4] = {1, 1, 1, 1};
|
||||
vsi_nn_tensor_attr_t attr;
|
||||
uint32_t i;
|
||||
uint32_t output_dims;
|
||||
|
||||
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
|
||||
status = vsi_nn_vxGetTensorAttr(output, &attr);
|
||||
if (status != VX_SUCCESS)
|
||||
{
|
||||
VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
|
||||
return status;
|
||||
}
|
||||
output_dims = attr.dim_num;
|
||||
for (i = 0; i < output_dims; i++)
|
||||
{
|
||||
dst_size[i] = attr.size[i];
|
||||
}
|
||||
|
||||
width = dst_size[0];
|
||||
height = dst_size[1];
|
||||
channel = dst_size[2];
|
||||
|
||||
shaderParam.globalWorkOffset[0] = 0;
|
||||
shaderParam.globalWorkOffset[1] = 0;
|
||||
shaderParam.globalWorkOffset[2] = 0;
|
||||
shaderParam.globalWorkScale[0] = 8;
|
||||
shaderParam.globalWorkScale[1] = 1;
|
||||
shaderParam.globalWorkScale[2] = 1;
|
||||
shaderParam.localWorkSize[0] = 16;
|
||||
shaderParam.localWorkSize[1] = 1;
|
||||
shaderParam.localWorkSize[2] = 1;
|
||||
shaderParam.globalWorkSize[0] = gcmALIGN((width + shaderParam.globalWorkScale[0] - 1)
|
||||
/ shaderParam.globalWorkScale[0], shaderParam.localWorkSize[0]);
|
||||
shaderParam.globalWorkSize[1] = gcmALIGN((height + shaderParam.globalWorkScale[1] - 1)
|
||||
/ shaderParam.globalWorkScale[1], shaderParam.localWorkSize[1]);
|
||||
shaderParam.globalWorkSize[2] = channel;
|
||||
|
||||
status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS,
|
||||
&shaderParam, sizeof(vx_kernel_execution_parameters_t));
|
||||
|
||||
if(status < 0)
|
||||
VSILOGE("error-%s,%d\n",__FILE__,__LINE__);
|
||||
|
||||
return status;
|
||||
}
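The initializer above derives the dispatch size per dimension as ceil(extent / globalWorkScale) rounded up to a multiple of the local work-group size; gcmALIGN does the rounding and therefore assumes a power-of-two alignment. A standalone C sketch of that computation:

#include <stdio.h>

static unsigned int aligned_global_size(unsigned int extent,
                                        unsigned int scale,
                                        unsigned int local_size)
{
    unsigned int n = (extent + scale - 1) / scale;     /* ceil(extent / scale)      */
    return (n + local_size - 1) & ~(local_size - 1);   /* gcmALIGN, power-of-two    */
}

int main(void)
{
    /* width 100, 8 pixels per thread, local group of 16: 13 threads -> 16 */
    printf("%u\n", aligned_global_size(100, 8, 16));
    return 0;
}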
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
vx_kernel_description_t vxExtra_ending_CPU =
|
||||
{
|
||||
_VX_KERNEL_ID,
|
||||
VX_KERNEL_NAME_EXTRA_ENDING_I16,
|
||||
_VX_KERNEL_FUNC_KERNEL,
|
||||
vxExtra_endingKernelParam,
|
||||
_cnt_of_array( vxExtra_endingKernelParam ),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vsi_nn_KernelInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxExtra_ending_i16 =
|
||||
{
|
||||
_VX_KERNEL_ID,
|
||||
VX_KERNEL_NAME_EXTRA_ENDING_I16,
|
||||
NULL,
|
||||
vxExtra_endingKernelParam,
|
||||
_cnt_of_array( vxExtra_endingKernelParam ),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxExtra_endingInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxExtra_ending_i8 =
|
||||
{
|
||||
_VX_KERNEL_ID,
|
||||
VX_KERNEL_NAME_EXTRA_ENDING_I8,
|
||||
NULL,
|
||||
vxExtra_endingKernelParam,
|
||||
_cnt_of_array( vxExtra_endingKernelParam ),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxExtra_endingInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxExtra_ending_u8 =
|
||||
{
|
||||
_VX_KERNEL_ID,
|
||||
VX_KERNEL_NAME_EXTRA_ENDING_U8,
|
||||
NULL,
|
||||
vxExtra_endingKernelParam,
|
||||
_cnt_of_array( vxExtra_endingKernelParam ),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxExtra_endingInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t * vx_kernel_EXTRA_ENDING_list[] =
|
||||
{
|
||||
&vxExtra_ending_CPU,
|
||||
&vxExtra_ending_i16,
|
||||
&vxExtra_ending_i8,
|
||||
&vxExtra_ending_u8,
|
||||
NULL
|
||||
};
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
@ -1,322 +0,0 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <float.h>
|
||||
#include <math.h>
|
||||
|
||||
#include "vsi_nn_platform.h"
|
||||
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_test.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "utils/vsi_nn_dtype_util.h"
|
||||
#include "utils/vsi_nn_math.h"
|
||||
#include "libnnext/vsi_nn_vxkernel.h"
|
||||
#include "libnnext/vx_lib_nnext.h"
|
||||
|
||||
#define _VX_KERNEL_VAR (vx_kernel_HEATMAP_MAX_KEYPOINT)
|
||||
#define _VX_KERNEL_ID (VX_KERNEL_ENUM_HEATMAP_MAX_KEYPOINT)
|
||||
#define _VX_KERNEL_NAME (VX_KERNEL_NAME_HEATMAP_MAX_KEYPOINT)
|
||||
#define _VX_KERNEL_FUNC_KERNEL (vxHeatmap_max_keypointKernel)
|
||||
|
||||
// This function uses Taylor expansion up to the quadratic term to approximate bicubic
|
||||
// upscaling result.
|
||||
// 2nd order Taylor expansion: D(x) = D - b'x + 1/2 * x'Ax
|
||||
// where D = grid[1][1], Taylor expansion center, the original score,
|
||||
// x = delta, the correction on max keypoint position,
|
||||
// D(x) = deltaScore, the accuracy score after correction
|
||||
static void solveForDelta
|
||||
(
|
||||
const float grid[3][3],
|
||||
float* delta,
|
||||
float* deltaScore,
|
||||
float fpAtol,
|
||||
float fpRtol
|
||||
)
|
||||
{
|
||||
// b: negative 1st order derivative at center
|
||||
// A: Hessian matrix at center (2nd order derivative)
|
||||
float A[2][2], b[2];
|
||||
float crossProd1, crossProd2;
|
||||
float detA;
|
||||
b[0] = -(grid[1][2] - grid[1][0]) / 2.0f;
|
||||
b[1] = -(grid[2][1] - grid[0][1]) / 2.0f;
|
||||
A[0][0] = grid[1][0] - 2.0f * grid[1][1] + grid[1][2];
|
||||
A[0][1] = (grid[2][2] - grid[2][0] - grid[0][2] + grid[0][0]) / 4.0f;
|
||||
A[1][0] = A[0][1];
|
||||
A[1][1] = grid[0][1] - 2.0f * grid[1][1] + grid[2][1];
|
||||
|
||||
// solve Ax=b, where x=delta -> delta = inv(A) * b
|
||||
crossProd1 = A[0][0] * A[1][1];
|
||||
crossProd2 = A[0][1] * A[1][0];
|
||||
detA = crossProd1 - crossProd2;
|
||||
// check if A is invertible
|
||||
if (fabs(detA) < (fpAtol + fpRtol * crossProd1)) return;
|
||||
delta[0] = (A[1][1] * b[0] - A[0][1] * b[1]) / detA;
|
||||
delta[1] = (A[0][0] * b[1] - A[1][0] * b[0]) / detA;
|
||||
|
||||
// clip out of range delta, i.e. delta > 3/2
|
||||
if (fabs(delta[0]) > 1.5f || fabs(delta[1]) > 1.5f)
|
||||
{
|
||||
float scale = (float)(1.5f / vsi_nn_max(fabs(delta[0]), fabs(delta[1])));
|
||||
delta[0] *= scale;
|
||||
delta[1] *= scale;
|
||||
}
|
||||
|
||||
*deltaScore = grid[1][1] - b[0] * delta[0] - b[1] * delta[1] +
|
||||
((A[0][0] * delta[0] + A[0][1] * delta[1]) * delta[0] +
|
||||
(A[1][0] * delta[0] + A[1][1] * delta[1]) * delta[1]) /
|
||||
2.0f;
|
||||
}
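solveForDelta solves the 2x2 system A*delta = b in closed form (delta = inv(A)*b) and evaluates the Taylor model D - b'delta + 1/2 * delta'A delta at the solution. A usage sketch relying on the function above (the example function and quadratic are illustrative): sampling D(w, h) = 10 - (w - 0.3)^2 - (h + 0.2)^2 on the 3x3 neighbourhood of its peak bin should recover delta = (0.3, -0.2) and deltaScore = 10, the true sub-pixel maximum.

static void solve_for_delta_example(void)
{
    float grid[3][3];
    float delta[2] = {0.0f, 0.0f};
    float deltaScore;
    int dh, dw;
    for (dh = -1; dh <= 1; dh++)
    {
        for (dw = -1; dw <= 1; dw++)
        {
            grid[dh + 1][dw + 1] = 10.0f - (dw - 0.3f) * (dw - 0.3f)
                                         - (dh + 0.2f) * (dh + 0.2f);
        }
    }
    deltaScore = grid[1][1];
    solveForDelta((const float (*)[3])grid, delta, &deltaScore, 1e-3f, 1e-3f);
    /* delta[0] ~ 0.3f, delta[1] ~ -0.2f, deltaScore ~ 10.0f */
}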
|
||||
|
||||
static vsi_status VX_CALLBACK vxHeatmap_max_keypointKernel
|
||||
(
|
||||
vx_node node,
|
||||
const vx_reference* paramObj,
|
||||
uint32_t paramNum
|
||||
)
|
||||
{
|
||||
#define ARG_NUM (1)
|
||||
#define TENSOR_NUM_INPUT (2)
|
||||
#define TENSOR_NUM_OUTPUT (2)
|
||||
#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT)
|
||||
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vx_context context = NULL;
|
||||
vx_tensor input[TENSOR_NUM_INPUT] = {0};
|
||||
vx_tensor output[TENSOR_NUM_OUTPUT] = {0};
|
||||
float *f32_in_buffer[TENSOR_NUM_INPUT] = {0};
|
||||
float *f32_out_buffer[TENSOR_NUM_OUTPUT] = {0};
|
||||
vsi_nn_tensor_attr_t in_attr[TENSOR_NUM_INPUT];
|
||||
vsi_nn_tensor_attr_t out_attr[TENSOR_NUM_OUTPUT];
|
||||
uint32_t in_elements[TENSOR_NUM_INPUT] = {0};
|
||||
uint32_t out_elements[TENSOR_NUM_OUTPUT]= {0};
|
||||
|
||||
int32_t type;
|
||||
|
||||
uint32_t i = 0;
|
||||
for(i = 0; i < TENSOR_NUM_INPUT; i++)
|
||||
{
|
||||
memset(&in_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t));
|
||||
}
|
||||
for(i = 0; i < TENSOR_NUM_OUTPUT; i++)
|
||||
{
|
||||
memset(&out_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t));
|
||||
}
|
||||
/* prepare data */
|
||||
context = vxGetContext((vx_reference)node);
|
||||
|
||||
for(i = 0; i < TENSOR_NUM_INPUT; i ++)
|
||||
{
|
||||
input[i] = (vx_tensor)paramObj[i];
|
||||
status = vsi_nn_vxGetTensorAttr(input[i], &in_attr[i]);
|
||||
TEST_CHECK_STATUS(status, final);
|
||||
in_elements[i] = vsi_nn_vxGetTensorElementNum(&in_attr[i]);
|
||||
f32_in_buffer[i] = (float *)malloc(in_elements[i] * sizeof(float));
|
||||
status = vsi_nn_vxConvertTensorToFloat32Data(
|
||||
context, input[i], &in_attr[i], f32_in_buffer[i],
|
||||
in_elements[i] * sizeof(float));
|
||||
TEST_CHECK_STATUS(status, final);
|
||||
}
|
||||
for(i = 0; i < TENSOR_NUM_OUTPUT; i ++)
|
||||
{
|
||||
output[i] = (vx_tensor)paramObj[i + TENSOR_NUM_INPUT];
|
||||
status = vsi_nn_vxGetTensorAttr(output[i], &out_attr[i]);
|
||||
TEST_CHECK_STATUS(status, final);
|
||||
out_elements[i] = vsi_nn_vxGetTensorElementNum(&out_attr[i]);
|
||||
f32_out_buffer[i]= (float *)malloc(out_elements[i] * sizeof(float));
|
||||
memset(f32_out_buffer[i], 0, out_elements[i] * sizeof(float));
|
||||
}
|
||||
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(type),
|
||||
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
|
||||
/* TODO: Add CPU kernel implementation */
|
||||
{
|
||||
uint32_t j, k;
|
||||
uint32_t numBoxes = in_attr[0].size[3];
|
||||
uint32_t heatmapSize = in_attr[0].size[2];
|
||||
uint32_t numKeypoints = in_attr[0].size[0];
|
||||
uint32_t boxInfoLength = 4;
|
||||
uint32_t output_score_index = 0;
|
||||
uint32_t output_keypoint_index = 0;
|
||||
|
||||
for(i = 0; i < numBoxes; i++)
|
||||
{
|
||||
for (j = 0; j < numKeypoints; j++)
|
||||
{
|
||||
uint32_t maxIndex = 0;
|
||||
float maxScore = -FLT_MAX;
|
||||
uint32_t maxIndexWidth;
|
||||
uint32_t maxIndexHeight;
|
||||
float localGrid[3][3];
|
||||
int32_t dh, dw;
|
||||
float delta[2] = {0.0f, 0.0f}, deltaScore;
|
||||
float wRoiStart = f32_in_buffer[1][i * boxInfoLength];
|
||||
float hRoiStart = f32_in_buffer[1][i * boxInfoLength + 1];
|
||||
float wRoiEnd = f32_in_buffer[1][i * boxInfoLength + 2];
|
||||
float hRoiEnd = f32_in_buffer[1][i * boxInfoLength + 3];
|
||||
float roiWidth = wRoiEnd - wRoiStart;
|
||||
float roiHeight = hRoiEnd - hRoiStart;
|
||||
float wRelativePos;
|
||||
float hRelativePos;
|
||||
for (k = 0; k < heatmapSize * heatmapSize; k++)
|
||||
{
|
||||
uint32_t index = i * heatmapSize * heatmapSize * numKeypoints
|
||||
+ k * numKeypoints + j;
|
||||
float val = f32_in_buffer[0][index];
|
||||
if (maxScore < val)
|
||||
{
|
||||
maxScore = val;
|
||||
maxIndex = k;
|
||||
}
|
||||
}
|
||||
maxIndexWidth = maxIndex % heatmapSize;
|
||||
maxIndexHeight = maxIndex / heatmapSize;
|
||||
|
||||
// get local 3x3 grid
|
||||
for (dh = -1; dh <= 1; dh++)
|
||||
{
|
||||
for (dw = -1; dw <= 1; dw++)
|
||||
{
|
||||
// cast uint32_t to int32_t
|
||||
int32_t h = (int32_t)(maxIndexHeight) + dh;
|
||||
int32_t w = (int32_t)(maxIndexWidth) + dw;
|
||||
uint32_t heatmapIndex;
|
||||
|
||||
// use mirroring for out of bound indexing
|
||||
// need to ensure heatmapSize >= 2
|
||||
h = h < 0 ? 1 : (h >= (int32_t)heatmapSize ? heatmapSize - 2 : h);
|
||||
w = w < 0 ? 1 : (w >= (int32_t)heatmapSize ? heatmapSize - 2 : w);
|
||||
|
||||
heatmapIndex = i * heatmapSize * heatmapSize * numKeypoints +
|
||||
(uint32_t)(h) * heatmapSize * numKeypoints +
|
||||
(uint32_t)(w) * numKeypoints + j;
|
||||
localGrid[dh + 1][dw + 1] = f32_in_buffer[0][heatmapIndex];
|
||||
}
|
||||
}
|
||||
deltaScore = maxScore;
|
||||
solveForDelta((const float (*)[3])localGrid, delta, &deltaScore, 1e-3f, 1e-3f);
|
||||
|
||||
wRelativePos = ((float)(maxIndexWidth) + delta[0] + 0.5f) /
|
||||
(float)(heatmapSize);
|
||||
hRelativePos = ((float)(maxIndexHeight) + delta[1] + 0.5f) /
|
||||
(float)(heatmapSize);
|
||||
f32_out_buffer[0][output_score_index] = deltaScore;
|
||||
f32_out_buffer[1][output_keypoint_index] = wRelativePos * roiWidth + wRoiStart;
|
||||
f32_out_buffer[1][output_keypoint_index + 1] = hRelativePos * roiHeight + hRoiStart;
|
||||
output_score_index++;
|
||||
output_keypoint_index +=2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* save data */
|
||||
for(i = 0; i < TENSOR_NUM_OUTPUT; i++)
|
||||
{
|
||||
status = vsi_nn_vxConvertFloat32DataToTensor(
|
||||
context, output[i], &out_attr[i], f32_out_buffer[i],
|
||||
out_elements[i] * sizeof(float));
|
||||
TEST_CHECK_STATUS(status, final);
|
||||
}
|
||||
|
||||
final:
|
||||
for (i = 0; i < TENSOR_NUM_INPUT; i++)
|
||||
{
|
||||
if (f32_in_buffer[i]) free(f32_in_buffer[i]);
|
||||
}
|
||||
for(i = 0; i < TENSOR_NUM_OUTPUT; i++)
|
||||
{
|
||||
if (f32_out_buffer[i]) free(f32_out_buffer[i]);
|
||||
}
|
||||
return status;
|
||||
} /* _VX_KERNEL_FUNC_KERNEL() */
|
||||
|
||||
static vx_param_description_t vxHeatmap_max_keypointKernelParam[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
};
|
||||
|
||||
vx_status VX_CALLBACK vxHeatmap_max_keypointInitializer
|
||||
(
|
||||
vx_node nodObj,
|
||||
const vx_reference *paramObj,
|
||||
vx_uint32 paraNum
|
||||
)
|
||||
{
|
||||
vx_status status = VX_SUCCESS;
|
||||
/*TODO: Add initial code for VX program*/
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
vx_kernel_description_t vxHeatmap_max_keypoint_CPU =
|
||||
{
|
||||
_VX_KERNEL_ID,
|
||||
_VX_KERNEL_NAME,
|
||||
_VX_KERNEL_FUNC_KERNEL,
|
||||
vxHeatmap_max_keypointKernelParam,
|
||||
_cnt_of_array( vxHeatmap_max_keypointKernelParam ),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vsi_nn_KernelInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxHeatmap_max_keypoint_VX =
|
||||
{
|
||||
_VX_KERNEL_ID,
|
||||
_VX_KERNEL_NAME,
|
||||
NULL,
|
||||
vxHeatmap_max_keypointKernelParam,
|
||||
_cnt_of_array( vxHeatmap_max_keypointKernelParam ),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxHeatmap_max_keypointInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t * vx_kernel_HEATMAP_MAX_KEYPOINT_list[] =
|
||||
{
|
||||
&vxHeatmap_max_keypoint_CPU,
|
||||
&vxHeatmap_max_keypoint_VX,
|
||||
NULL
|
||||
};
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
File diff suppressed because it is too large
@ -1,806 +0,0 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <math.h>
|
||||
|
||||
#include "vsi_nn_platform.h"
|
||||
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "utils/vsi_nn_dtype_util.h"
|
||||
#include "utils/vsi_nn_math.h"
|
||||
#include "libnnext/vsi_nn_vxkernel.h"
|
||||
#include "libnnext/vx_lib_nnext.h"
|
||||
|
||||
#define INPUT_FP16 0
|
||||
#define OUTPUT_FP16 0
|
||||
|
||||
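/*
 * Find a divisor of `data` within [minLimit, maxLimit]. The first divisor
 * found is taken, and later divisors replace it only when their remainder
 * modulo `alignData` is smaller; returns VX_FAILURE if no divisor exists in
 * the range. It is used below (currently under #if 0) to pick a local
 * work-group size. This summary is inferred from the code itself.
 */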
vx_status getFactor(vx_uint32 data, vx_uint32 *factor, vx_uint32 minLimit, vx_uint32 maxLimit, vx_uint32 alignData)
|
||||
{
|
||||
vx_uint32 i = 0;
|
||||
vx_uint32 maxFactor = alignData - 1;
|
||||
vx_status status = VX_FAILURE;
|
||||
|
||||
for (i = minLimit; i <= maxLimit; i ++)
|
||||
{
|
||||
if (data % i == 0)
|
||||
{
|
||||
            if (status == VX_FAILURE)
|
||||
{
|
||||
*factor = i;
|
||||
maxFactor = i;
|
||||
status = VX_SUCCESS;
|
||||
continue;
|
||||
}
|
||||
else if ((i % alignData) < (maxFactor % alignData))
|
||||
{
|
||||
*factor = i;
|
||||
maxFactor = i;
|
||||
status = VX_SUCCESS;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
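/*
 * Frame a 1D/2D/3D int16 signal along `axis`: each output frame copies
 * `frame_len` samples, starting every `step` samples. With pad_end set, the
 * frame count is ceil(len / step) and out-of-range samples are zero-filled
 * (the `pad_val` argument is currently ignored); otherwise it is
 * (len - frame_len) / step + 1. The resulting output shape is reported
 * through dstW/dstH/dstC/dstB. Summary inferred from the code below.
 */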
void mySignalFrameFunc
|
||||
(
|
||||
void* imgIn,
|
||||
void* imgOut,
|
||||
uint32_t input_dim,
|
||||
uint32_t width,
|
||||
uint32_t height,
|
||||
uint32_t channel,
|
||||
uint32_t batch,
|
||||
uint32_t frame_len, // window size
|
||||
uint32_t step,
|
||||
uint32_t pad_end,
|
||||
uint32_t pad_val,
|
||||
uint32_t axis,
|
||||
uint32_t *dstW,
|
||||
uint32_t *dstH,
|
||||
uint32_t *dstC,
|
||||
uint32_t *dstB
|
||||
)
|
||||
{
|
||||
uint8_t* tmpIn = (uint8_t*)imgIn;
|
||||
uint8_t* tmpOut = (uint8_t*)imgOut;
|
||||
|
||||
uint32_t i,j,k;
|
||||
uint32_t size = 0;
|
||||
uint32_t iter = 0;
|
||||
|
||||
if(input_dim == 1)
|
||||
{
|
||||
if(axis != 0)
|
||||
{
|
||||
VSILOGE("error.\n");
|
||||
return;
|
||||
}
|
||||
*dstW = frame_len;
|
||||
//*dstH = (len - frame_len) / step + 1;
|
||||
*dstH = pad_end ? ((width + step - 1 ) / step) : ((width - frame_len) / step + 1);
|
||||
*dstC = 1;
|
||||
*dstB = 1;
|
||||
|
||||
size = (*dstW) * sizeof(int16_t);
|
||||
iter = pad_end ? width : (width - frame_len + 1);
|
||||
if(pad_end)
|
||||
{
|
||||
int16_t* output = (int16_t*)tmpOut;
|
||||
int16_t* input = (int16_t*)tmpIn;
|
||||
uint32_t m = 0;
|
||||
for(i = 0, j = 0; i < iter; i += step)
|
||||
{
|
||||
for(m = i; m < frame_len + i; m++)
|
||||
{
|
||||
if(m >= width)
|
||||
{
|
||||
output[j] = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
output[j] = input[m];
|
||||
}
|
||||
j++;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(i = 0, j = 0; i < iter; i += step, j++)
|
||||
{
|
||||
memcpy(tmpOut + j * size, tmpIn + i * sizeof(int16_t), size);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if(input_dim == 2)
|
||||
{
|
||||
if(axis == 0)
|
||||
{
|
||||
uint8_t* src = tmpIn;
|
||||
uint8_t* dst = tmpOut;
|
||||
|
||||
*dstH = frame_len;
|
||||
*dstW = width;
|
||||
*dstC = pad_end ? ((height + step - 1) / step) : ((height - frame_len) / step + 1);
|
||||
|
||||
*dstB = 1;
|
||||
|
||||
size = width * frame_len * sizeof(int16_t);
|
||||
iter = pad_end ? (height) : (height - frame_len + 1);
|
||||
if(pad_end)
|
||||
{
|
||||
uint32_t m = 0;
|
||||
size = width * sizeof(int16_t);
|
||||
for(i = 0, j = 0; i < iter; i += step)
|
||||
{
|
||||
for(m = i; m < frame_len + i; m++)
|
||||
{
|
||||
if(m >= height)
|
||||
{
|
||||
memset(dst + j * size, 0, size);
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy(dst + j * size, src + m * width * sizeof(int16_t), size);
|
||||
}
|
||||
j++;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(i = 0, j = 0; i < iter; i += step, j++)
|
||||
{
|
||||
memcpy(dst + j * size, src + i * width * sizeof(int16_t), size);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if(axis == 1)
|
||||
{
|
||||
*dstW = frame_len;
|
||||
|
||||
//*dstH = (len - frame_len) / step + 1;
|
||||
*dstH = pad_end ? ((width + step - 1 ) / step) : ((width - frame_len) / step + 1);
|
||||
|
||||
*dstC = height;
|
||||
*dstB = 1;
|
||||
|
||||
size = (*dstW) * sizeof(int16_t);
|
||||
iter = pad_end ? width : (width - frame_len + 1);
|
||||
if(pad_end)
|
||||
{
|
||||
for(k = 0; k < height; k++)
|
||||
{
|
||||
uint8_t* src = tmpIn + k * width * sizeof(int16_t);
|
||||
uint8_t* dst = tmpOut + k * (*dstW) * (*dstH) * sizeof(int16_t);
|
||||
|
||||
int16_t* output = (int16_t*)dst;
|
||||
int16_t* input = (int16_t*)src;
|
||||
uint32_t m = 0;
|
||||
for(i = 0, j = 0; i < iter; i += step)
|
||||
{
|
||||
for(m = i; m < frame_len + i; m++)
|
||||
{
|
||||
if(m >= width)
|
||||
{
|
||||
output[j] = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
output[j] = input[m];
|
||||
}
|
||||
j++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(k = 0; k < height; k++)
|
||||
{
|
||||
uint8_t* src = tmpIn + k * width * sizeof(int16_t);
|
||||
uint8_t* dst = tmpOut + k * (*dstW) * (*dstH) * sizeof(int16_t);
|
||||
|
||||
for(i = 0, j = 0; i < iter; i += step, j++)
|
||||
{
|
||||
memcpy(dst + j * size, src + i * sizeof(int16_t), size);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else if(input_dim == 3)
|
||||
{
|
||||
if(axis == 0)
|
||||
{
|
||||
uint8_t* src = tmpIn;
|
||||
uint8_t* dst = tmpOut;
|
||||
size = width * height * frame_len * sizeof(int16_t);
|
||||
|
||||
*dstW = width;
|
||||
*dstH = height;
|
||||
*dstC = frame_len;
|
||||
*dstB = pad_end ? ((channel + step - 1) / step) :((channel - frame_len) / step + 1);
|
||||
iter = pad_end ? channel : (channel - frame_len + 1);
|
||||
if(pad_end)
|
||||
{
|
||||
uint32_t m = 0;
|
||||
size = width * height * sizeof(int16_t);
|
||||
for(i = 0, j = 0; i < iter; i += step)
|
||||
{
|
||||
for(m = i; m < frame_len + i; m++)
|
||||
{
|
||||
if(m >= channel)
|
||||
{
|
||||
memset(dst + j * size, 0 , size);
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy(dst + j * size, src + m * width * height * sizeof(int16_t), size);
|
||||
}
|
||||
j++;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(i = 0, j = 0; i < iter; i += step, j++)
|
||||
{
|
||||
memcpy(dst + j * size, src + i * width * height * sizeof(int16_t), size);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if(axis == 1)
|
||||
{
|
||||
*dstH = frame_len;
|
||||
*dstW = width;
|
||||
*dstC = pad_end ? ((height + step - 1) / step) : ((height - frame_len) / step + 1);
|
||||
*dstB = channel;
|
||||
|
||||
size = width * frame_len * sizeof(int16_t);
|
||||
iter = pad_end ? (height) : (height - frame_len + 1);
|
||||
if(pad_end)
|
||||
{
|
||||
uint32_t m = 0;
|
||||
size = width * sizeof(int16_t);
|
||||
for(k = 0; k < channel; k++)
|
||||
{
|
||||
uint8_t* src = tmpIn + k * width * height* sizeof(int16_t);
|
||||
uint8_t* dst = tmpOut + k * (*dstC) * (*dstW) * (*dstH) * sizeof(int16_t);
|
||||
|
||||
for(i = 0, j = 0; i < iter; i += step)
|
||||
{
|
||||
for(m = i; m < frame_len + i; m++)
|
||||
{
|
||||
if(m >= height)
|
||||
{
|
||||
memset(dst + j * size, 0, size);
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy(dst + j * size, src + m * width * sizeof(int16_t), size);
|
||||
}
|
||||
j++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(k = 0; k < channel; k++)
|
||||
{
|
||||
uint8_t* src = tmpIn + k * width * height* sizeof(int16_t);
|
||||
uint8_t* dst = tmpOut + k * (*dstC) * (*dstW) * (*dstH) * sizeof(int16_t);
|
||||
|
||||
for(i = 0, j = 0; i < iter; i += step, j++)
|
||||
{
|
||||
memcpy(dst + j * size, src + i * width * sizeof(int16_t), size);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else if(axis == 2)
|
||||
{
|
||||
//*dstH = (len - frame_len) / step + 1;
|
||||
*dstH = pad_end ? ((width + step - 1 ) / step) : ((width - frame_len) / step + 1);
|
||||
*dstW = frame_len;
|
||||
*dstC = height;
|
||||
*dstB = channel;
|
||||
|
||||
size = (*dstW) * sizeof(int16_t);
|
||||
iter = pad_end ? width : (width - frame_len + 1);
|
||||
|
||||
if(pad_end)
|
||||
{
|
||||
for(k = 0; k < channel * height; k++)
|
||||
{
|
||||
uint8_t* src = tmpIn + k * width * sizeof(int16_t);
|
||||
uint8_t* dst = tmpOut + k * (*dstW) * (*dstH) * sizeof(int16_t);
|
||||
|
||||
int16_t* output = (int16_t*)dst;
|
||||
int16_t* input = (int16_t*)src;
|
||||
uint32_t m = 0;
|
||||
for(i = 0, j = 0; i < iter; i += step)
|
||||
{
|
||||
for(m = i; m < frame_len + i; m++)
|
||||
{
|
||||
if(m >= width)
|
||||
{
|
||||
output[j] = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
output[j] = input[m];
|
||||
}
|
||||
j++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(k = 0; k < channel * height; k++)
|
||||
{
|
||||
uint8_t* src = tmpIn + k * width * sizeof(int16_t);
|
||||
uint8_t* dst = tmpOut + k * (*dstW) * (*dstH) * sizeof(int16_t);
|
||||
for(i = 0, j = 0; i < iter; i += step, j++)
|
||||
{
|
||||
memcpy(dst + j * size, src + i * sizeof(int16_t), size);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
vsi_status VX_CALLBACK vxSignalFrameKernel
|
||||
(
|
||||
vx_node node,
|
||||
const vx_reference* paramObj,
|
||||
uint32_t paramNum
|
||||
)
|
||||
{
|
||||
vsi_status status = VX_ERROR_INVALID_PARAMETERS;
|
||||
|
||||
if(paramNum == 7)
|
||||
{
|
||||
vx_context context = NULL;
|
||||
// tensor
|
||||
vx_tensor imgObj[7] = { NULL };
|
||||
#if INPUT_FP16
|
||||
int16_t *input = NULL;
|
||||
#else
|
||||
uint8_t *input = NULL;
|
||||
#endif
|
||||
#if OUTPUT_FP16
|
||||
int16_t *output = NULL;
|
||||
#else
|
||||
uint8_t *output = NULL;
|
||||
#endif
|
||||
|
||||
uint32_t input_size[DIM_SIZE] = {1, 1, 1, 1}, output_size[DIM_SIZE] = {1, 1, 1, 1}, dst_size[DIM_SIZE] = {1, 1, 1, 1};
|
||||
vsi_nn_tensor_attr_t in_attr, out_attr;
|
||||
|
||||
vsi_nn_type_e outputFormat = VSI_NN_TYPE_FLOAT16;
|
||||
uint32_t input_dims = 0, output_dims = 0, tmpDim = 0;
|
||||
|
||||
vx_scalar scalar[5] = { NULL };
|
||||
uint32_t frame_length = 0, step = 0, pad_end = 0, pad = 0, axis = 0, axis0 = 0;
|
||||
uint32_t i = 0;
|
||||
|
||||
        memset(&in_attr, 0x0, sizeof(vsi_nn_tensor_attr_t));
        memset(&out_attr, 0x0, sizeof(vsi_nn_tensor_attr_t));

        /* Fetch the parameter objects before querying tensor attributes;
           otherwise vsi_nn_vxGetTensorAttr() is called on NULL tensors. */
        imgObj[0] = (vx_tensor)paramObj[0];
        imgObj[1] = (vx_tensor)paramObj[1]; //output
        scalar[0] = (vx_scalar)paramObj[2];
        scalar[1] = (vx_scalar)paramObj[3];
        scalar[2] = (vx_scalar)paramObj[4];
        scalar[3] = (vx_scalar)paramObj[5];
        scalar[4] = (vx_scalar)paramObj[6];

        status = vsi_nn_vxGetTensorAttr(imgObj[0], &in_attr);
        status |= vsi_nn_vxGetTensorAttr(imgObj[1], &out_attr);
        if (status != VX_SUCCESS)
        {
            VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
            goto OnError;
        }
|
||||
context = vxGetContext((vx_reference)node);
|
||||
if (context == NULL)
|
||||
{
|
||||
VSILOGE("vxGetContext failure! at line %d\n", __LINE__);
|
||||
goto OnError;
|
||||
}
|
||||
//input
|
||||
input_dims = in_attr.dim_num;
|
||||
for (i = 0; i < input_dims; i++)
|
||||
{
|
||||
input_size[i] = in_attr.size[i];
|
||||
}
|
||||
|
||||
//output
|
||||
output_dims = out_attr.dim_num;
|
||||
outputFormat = out_attr.dtype.vx_type;
|
||||
for (i = 0; i < output_dims; i++)
|
||||
{
|
||||
output_size[i] = out_attr.size[i];
|
||||
}
|
||||
|
||||
input_size[2] = (input_dims <= 2)?1:input_size[2];
|
||||
input_size[3] = (input_dims <= 3)?1:input_size[3];
|
||||
|
||||
|
||||
#if INPUT_FP16
|
||||
input = (int16_t*)malloc(input_size[0]*input_size[1]*input_size[2]*sizeof(int16_t));
|
||||
#else
|
||||
//input = (uint8_t*)malloc(input_size[0]*input_size[1]*input_size[2]*vsi_nn_GetTypeBytes(inputFormat));
|
||||
#endif
|
||||
#if OUTPUT_FP16
|
||||
output = (int16_t*)malloc(output_size[0]*output_size[1]*output_size[2]*sizeof(int16_t));
|
||||
#else
|
||||
output = (uint8_t*)malloc(output_size[0]*output_size[1]*output_size[2]*vsi_nn_GetTypeBytes(outputFormat));
|
||||
#endif
|
||||
|
||||
input = vsi_nn_vxCopyTensorToData(context, imgObj[0], &in_attr);
|
||||
|
||||
// scalar
|
||||
status = vxCopyScalar(scalar[0], &frame_length, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
status |= vxCopyScalar(scalar[1], &step, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
status |= vxCopyScalar(scalar[2], &pad_end, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
status |= vxCopyScalar(scalar[3], &pad, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
status |= vxCopyScalar(scalar[4], &axis, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
if (status != VX_SUCCESS)
|
||||
{
|
||||
VSILOGE("vxCopyScalar failure! at line %d\n", __LINE__);
|
||||
goto OnError;
|
||||
}
|
||||
|
||||
// Call C Prototype
|
||||
if(output_dims == 2)
|
||||
tmpDim = 1;
|
||||
else
|
||||
tmpDim = input_dims;
|
||||
{
|
||||
axis0 = input_dims - axis - 1;
|
||||
}
|
||||
mySignalFrameFunc(input, output, tmpDim, input_size[0],
|
||||
input_size[1], input_size[2], input_size[3],
|
||||
frame_length, step, pad_end, pad, axis0,
|
||||
&dst_size[0], &dst_size[1], &dst_size[2], &dst_size[3]);
|
||||
|
||||
//output tensor
|
||||
status = vsi_nn_vxCopyDataToTensor(context, imgObj[1], &out_attr, output);
|
||||
if (status != VX_SUCCESS)
|
||||
{
|
||||
VSILOGE("vsi_nn_vxCopyDataToTensor failure! at line %d\n", __LINE__);
|
||||
goto OnError;
|
||||
}
|
||||
|
||||
OnError:
|
||||
if(input) free(input);
|
||||
if(output) free(output);
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
vsi_status VX_CALLBACK vxSignalFrameInitializer
|
||||
(
|
||||
vx_node nodObj,
|
||||
const vx_reference *paramObj,
|
||||
uint32_t paraNum
|
||||
)
|
||||
{
|
||||
vsi_status status = VX_SUCCESS;
|
||||
// Alignment with a power of two value.
|
||||
#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1)
|
||||
vx_kernel_execution_parameters_t shaderParam = {
|
||||
3, // workdim
|
||||
{0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
|
||||
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
|
||||
{0, 0, 0}, // localWorkSize: local group size in thread
|
||||
{0, 0, 0}}; // globalWorkSize: image size in thread
|
||||
vx_scalar scalar[5];
|
||||
vx_tensor input = (vx_tensor)paramObj[0];
|
||||
vx_tensor output = (vx_tensor)paramObj[1];
|
||||
|
||||
uint32_t input_size[DIM_SIZE] = {1, 1, 1, 1};
|
||||
uint32_t input_dims = 0;
|
||||
uint32_t output_dims = 0;
|
||||
//vx_uint32 factor = 1;
|
||||
//vx_uint32 maxWorkGroupSize = 8;
|
||||
uint32_t frame_length, step, pad_end, pad, axis, axis0;
|
||||
uint32_t output_channel = 0;
|
||||
|
||||
vx_uint32 i = 0;
|
||||
vsi_nn_tensor_attr_t attr[2];
|
||||
memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t));
|
||||
memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t));
|
||||
status = vsi_nn_vxGetTensorAttr(input, &attr[0]);
|
||||
status |= vsi_nn_vxGetTensorAttr(output, &attr[1]);
|
||||
if (status != VX_SUCCESS)
|
||||
{
|
||||
VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
|
||||
return status;
|
||||
}
|
||||
|
||||
input_dims = attr[0].dim_num;
|
||||
for (i = 0; i < input_dims; i++)
|
||||
{
|
||||
input_size[i] = attr[0].size[i];
|
||||
}
|
||||
output_dims = attr[1].dim_num;
|
||||
|
||||
scalar[0] = (vx_scalar)paramObj[2];
|
||||
scalar[1] = (vx_scalar)paramObj[3];
|
||||
scalar[2] = (vx_scalar)paramObj[4];
|
||||
scalar[3] = (vx_scalar)paramObj[5];
|
||||
scalar[4] = (vx_scalar)paramObj[6];
|
||||
|
||||
status = vxCopyScalar(scalar[0], &frame_length, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
status |= vxCopyScalar(scalar[1], &step, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
status |= vxCopyScalar(scalar[2], &pad_end, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
status |= vxCopyScalar(scalar[3], &pad, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
status |= vxCopyScalar(scalar[4], &axis0, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
if (status != VX_SUCCESS)
|
||||
{
|
||||
VSILOGE("vxCopyScalar failure! at line %d\n", __LINE__);
|
||||
return status;
|
||||
}
|
||||
|
||||
{
|
||||
if(input_dims == 2 && output_dims == 2)
|
||||
{
|
||||
axis = input_dims - axis0 - 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
axis = input_dims - axis0 - 1;
|
||||
}
|
||||
}
|
||||
|
||||
input_size[2] = (input_dims <= 2)?1:input_size[2];
|
||||
//input_size[2] = (input_dims == 4)?(input_size[2] * input_size[3]):input_size[2];
|
||||
|
||||
shaderParam.globalWorkOffset[0] = 0;
|
||||
shaderParam.globalWorkOffset[1] = 0;
|
||||
shaderParam.globalWorkOffset[2] = 0;
|
||||
if((output_dims == 2)
|
||||
|| (input_dims == 2 && output_dims == 3 && axis == 1)
|
||||
|| (input_dims == 3 && axis == 2))
|
||||
{
|
||||
shaderParam.globalWorkScale[0] = 1;
|
||||
shaderParam.globalWorkScale[1] = 1;
|
||||
shaderParam.globalWorkScale[2] = 1;
|
||||
shaderParam.localWorkSize[0] = 1;
|
||||
shaderParam.localWorkSize[1] = 1;
|
||||
#if 0
|
||||
if (input_size[1] <= maxWorkGroupSize)
|
||||
shaderParam.localWorkSize[1] = input_size[1];
|
||||
else if (getFactor(input_size[1], &factor, 2, maxWorkGroupSize, 8) == VX_SUCCESS)
|
||||
shaderParam.localWorkSize[1] = factor;
|
||||
else
|
||||
shaderParam.localWorkSize[1] = 1;
|
||||
#endif
|
||||
|
||||
shaderParam.localWorkSize[2] = 1;
|
||||
shaderParam.globalWorkSize[0] = gcmALIGN((1 + shaderParam.globalWorkScale[0] - 1)
|
||||
/ shaderParam.globalWorkScale[0], shaderParam.localWorkSize[0]);
|
||||
|
||||
shaderParam.globalWorkSize[1] = gcmALIGN((input_size[1] + shaderParam.globalWorkScale[1] - 1)
|
||||
/ shaderParam.globalWorkScale[1], shaderParam.localWorkSize[1]);
|
||||
//shaderParam.globalWorkSize[1] = input_size[1];
|
||||
shaderParam.globalWorkSize[2] = gcmALIGN((input_size[2] + shaderParam.globalWorkScale[2] - 1)
|
||||
/ shaderParam.globalWorkScale[2], shaderParam.localWorkSize[2]);
|
||||
}
|
||||
else if((input_dims == 2 && output_dims == 3 && axis == 0)
|
||||
|| (input_dims == 3 && axis == 1))
|
||||
{
|
||||
int height = (pad_end == 0) ? (input_size[1] - frame_length + 1) : (input_size[1]);
|
||||
shaderParam.globalWorkScale[0] = 8;
|
||||
shaderParam.globalWorkScale[1] = step;
|
||||
shaderParam.globalWorkScale[2] = 1;
|
||||
shaderParam.localWorkSize[0] = 1;
|
||||
shaderParam.localWorkSize[1] = 1;
|
||||
shaderParam.localWorkSize[2] = 1;
|
||||
shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1)
|
||||
/ shaderParam.globalWorkScale[0], shaderParam.localWorkSize[0]);
|
||||
shaderParam.globalWorkSize[1] = gcmALIGN((height + shaderParam.globalWorkScale[1] - 1)
|
||||
/ shaderParam.globalWorkScale[1], shaderParam.localWorkSize[1]);
|
||||
shaderParam.globalWorkSize[2] = gcmALIGN((input_size[2] + shaderParam.globalWorkScale[2] - 1)
|
||||
/ shaderParam.globalWorkScale[2], shaderParam.localWorkSize[2]);
|
||||
|
||||
output_channel = (pad_end == 0) ? ((input_size[1] - frame_length) / step + 1) : ((input_size[1] + step - 1) / step);
|
||||
}
|
||||
else if(input_dims == 3 && axis == 0)
|
||||
{
|
||||
int channel = (pad_end == 0) ? (input_size[2] - frame_length + 1) : (input_size[2]);
|
||||
shaderParam.globalWorkScale[0] = 8;
|
||||
shaderParam.globalWorkScale[1] = 1;
|
||||
shaderParam.globalWorkScale[2] = step;
|
||||
shaderParam.localWorkSize[0] = 1;
|
||||
shaderParam.localWorkSize[1] = 1;
|
||||
shaderParam.localWorkSize[2] = 1;
|
||||
shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1)
|
||||
/ shaderParam.globalWorkScale[0], shaderParam.localWorkSize[0]);
|
||||
shaderParam.globalWorkSize[1] = gcmALIGN((input_size[1] + shaderParam.globalWorkScale[1] - 1)
|
||||
/ shaderParam.globalWorkScale[1], shaderParam.localWorkSize[1]);
|
||||
shaderParam.globalWorkSize[2] = gcmALIGN((channel + shaderParam.globalWorkScale[2] - 1)
|
||||
/ shaderParam.globalWorkScale[2], shaderParam.localWorkSize[2]);
|
||||
}
|
||||
|
||||
status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS,
|
||||
&shaderParam, sizeof(vx_kernel_execution_parameters_t));
|
||||
if(status < 0)
|
||||
{
|
||||
VSILOGE("[%s : %d]Initializer failure! \n",__FILE__, __LINE__);
|
||||
}
|
||||
{
|
||||
status |= vxSetNodeUniform(nodObj, "input_width", 1, &input_size[0]);
|
||||
status |= vxSetNodeUniform(nodObj, "input_height", 1, &input_size[1]);
|
||||
status |= vxSetNodeUniform(nodObj, "input_channel", 1, &input_size[2]);
|
||||
status |= vxSetNodeUniform(nodObj, "output_channel", 1, &output_channel);
|
||||
if(status < 0)
|
||||
{
|
||||
VSILOGE("[%s : %d]Initializer failure! \n",__FILE__, __LINE__);
|
||||
}
|
||||
}
|
||||
return status;
|
||||
}
|
||||
static vx_param_description_t vxSignalFrameKernelParam[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}
|
||||
};
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
vx_kernel_description_t vxSignalFrameKernelInfo =
|
||||
{
|
||||
VX_KERNEL_ENUM_SIGNALFRAME,
|
||||
VX_KERNEL_NAME_SIGNALFRAME_WIDTH,
|
||||
NULL,
|
||||
vxSignalFrameKernelParam,
|
||||
(sizeof(vxSignalFrameKernelParam) / sizeof(vxSignalFrameKernelParam[0])),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxSignalFrameInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxSignalFrameKernelInfo_height =
|
||||
{
|
||||
VX_KERNEL_ENUM_SIGNALFRAME,
|
||||
VX_KERNEL_NAME_SIGNALFRAME_HEIGHT,
|
||||
NULL,
|
||||
vxSignalFrameKernelParam,
|
||||
(sizeof(vxSignalFrameKernelParam) / sizeof(vxSignalFrameKernelParam[0])),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxSignalFrameInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxSignalFrameKernelInfo_channel =
|
||||
{
|
||||
VX_KERNEL_ENUM_SIGNALFRAME,
|
||||
VX_KERNEL_NAME_SIGNALFRAME_CHANNEL,
|
||||
NULL,
|
||||
vxSignalFrameKernelParam,
|
||||
(sizeof(vxSignalFrameKernelParam) / sizeof(vxSignalFrameKernelParam[0])),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxSignalFrameInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxSignalFrameKernelInfo_8bit =
|
||||
{
|
||||
VX_KERNEL_ENUM_SIGNALFRAME,
|
||||
VX_KERNEL_NAME_SIGNALFRAME_WIDTH_8BITS,
|
||||
NULL,
|
||||
vxSignalFrameKernelParam,
|
||||
(sizeof(vxSignalFrameKernelParam) / sizeof(vxSignalFrameKernelParam[0])),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxSignalFrameInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxSignalFrameKernelInfo_height_8bit =
|
||||
{
|
||||
VX_KERNEL_ENUM_SIGNALFRAME,
|
||||
VX_KERNEL_NAME_SIGNALFRAME_HEIGHT_8BITS,
|
||||
NULL,
|
||||
vxSignalFrameKernelParam,
|
||||
(sizeof(vxSignalFrameKernelParam) / sizeof(vxSignalFrameKernelParam[0])),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxSignalFrameInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxSignalFrameKernelInfo_channel_8bit =
|
||||
{
|
||||
VX_KERNEL_ENUM_SIGNALFRAME,
|
||||
VX_KERNEL_NAME_SIGNALFRAME_CHANNEL_8BITS,
|
||||
NULL,
|
||||
vxSignalFrameKernelParam,
|
||||
(sizeof(vxSignalFrameKernelParam) / sizeof(vxSignalFrameKernelParam[0])),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxSignalFrameInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxSignalFrameKernelInfo_CPU =
|
||||
{
|
||||
VX_KERNEL_ENUM_SIGNALFRAME,
|
||||
VX_KERNEL_NAME_SIGNALFRAME_WIDTH,
|
||||
vxSignalFrameKernel,
|
||||
vxSignalFrameKernelParam,
|
||||
(sizeof(vxSignalFrameKernelParam) / sizeof(vxSignalFrameKernelParam[0])),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vsi_nn_KernelInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t * vx_kernel_SIGNALFRAME_list[] =
|
||||
{
|
||||
&vxSignalFrameKernelInfo_CPU,
|
||||
&vxSignalFrameKernelInfo,
|
||||
&vxSignalFrameKernelInfo_height,
|
||||
&vxSignalFrameKernelInfo_channel,
|
||||
&vxSignalFrameKernelInfo_8bit,
|
||||
&vxSignalFrameKernelInfo_height_8bit,
|
||||
&vxSignalFrameKernelInfo_channel_8bit,
|
||||
NULL
|
||||
};
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
@ -1,481 +0,0 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <math.h>
|
||||
|
||||
#include "vsi_nn_platform.h"
|
||||
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "utils/vsi_nn_dtype_util.h"
|
||||
#include "utils/vsi_nn_math.h"
|
||||
#include "libnnext/vsi_nn_vxkernel.h"
|
||||
#include "libnnext/vx_lib_nnext.h"
|
||||
|
||||
#define _VX_KERNEL_VAR (vx_kernel_SPATIAL_TRANSFORMER)
|
||||
#define _VX_KERNEL_ID (VX_KERNEL_ENUM_SPATIAL_TRANSFORMER)
|
||||
#define _VX_KERNEL_NAME (VX_KERNEL_NAME_SPATIAL_TRANSFORMER)
|
||||
#define _VX_KERNEL_FUNC_KERNEL (vxSpatial_transformerKernel)
|
||||
|
||||
|
||||
static vsi_status VX_CALLBACK vxSpatial_transformerKernel
|
||||
(
|
||||
vx_node node,
|
||||
const vx_reference* paramObj,
|
||||
uint32_t paramNum
|
||||
)
|
||||
{
|
||||
    /* TODO: CPU implementation not provided; this kernel returns VX_SUCCESS
       without computing anything. */
|
||||
vsi_status status = VX_SUCCESS;
|
||||
|
||||
return status;
|
||||
} /* _VX_KERNEL_FUNC_KERNEL() */
|
||||
|
||||
static vx_param_description_t s_params[] =
|
||||
{
|
||||
{ VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
|
||||
{ VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
|
||||
{ VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
|
||||
{ VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED },
|
||||
{ VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
|
||||
};
|
||||
|
||||
vx_status VX_CALLBACK vxTransform_GemmInputValidator(vx_node node, vx_uint32 index)
|
||||
{
|
||||
return VX_SUCCESS;
|
||||
}
|
||||
|
||||
vx_status VX_CALLBACK vxTransform_GemmOutputValidator(vx_node node, vx_uint32 index, vx_meta_format metaObj)
|
||||
{
|
||||
return VX_SUCCESS;
|
||||
}
|
||||
|
||||
vx_status VX_CALLBACK vxValidator(vx_node node, const vx_reference parameters[],
|
||||
vx_uint32 num, vx_meta_format metas[])
|
||||
{
|
||||
vx_status status = VX_SUCCESS;
|
||||
vx_uint32 index = 0;
|
||||
for(index = 0; index < num; index++)
|
||||
{
|
||||
if(index < 2)
|
||||
{
|
||||
status |= vxTransform_GemmInputValidator(node,index);
|
||||
}
|
||||
else
|
||||
{
|
||||
status |= vxTransform_GemmOutputValidator(node,index,metas[index]);
|
||||
}
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
static vx_param_description_t vxTransform_GemmKernelParam[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}
|
||||
};
|
||||
|
||||
vx_status VX_CALLBACK vxTransform_GemmInitializer(vx_node nodObj, const vx_reference *paramObj, vx_uint32 paraNum)
|
||||
{
|
||||
// Alignment with a power of two value.
|
||||
#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1)
|
||||
#define gcmMIN(x, y) (((x) <= (y)) ? (x) : (y))
|
||||
#define gcmMAX(x, y) (((x) >= (y)) ? (x) : (y))
|
||||
#define MAX_MULTIPLIER_NUM (65535)
|
||||
#define MAX_POST_SHIFT_BITS (31)
|
||||
vx_kernel_execution_parameters_t shaderParam = {
|
||||
2, // workdim
|
||||
{0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
|
||||
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
|
||||
{0, 0, 0}, // localWorkSize: local group size in thread
|
||||
{0, 0, 0}}; // globalWorkSize: image size in thread
|
||||
|
||||
vx_status status = VX_SUCCESS;
|
||||
vx_tensor input0 = (vx_tensor)paramObj[0];
|
||||
vx_tensor input1 = (vx_tensor)paramObj[1];
|
||||
vx_tensor output = (vx_tensor)paramObj[2];
|
||||
vx_enum src0Format = VSI_NN_TYPE_FLOAT16;
|
||||
vx_enum src1Format = VSI_NN_TYPE_FLOAT16;
|
||||
vx_enum dstFormat = VSI_NN_TYPE_FLOAT16;
|
||||
vx_uint32 coord_size[4] = {1, 1, 1, 1};
|
||||
vx_uint32 i = 0;
|
||||
vsi_nn_tensor_attr_t attr[3];
|
||||
|
||||
memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t));
|
||||
memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t));
|
||||
memset(&attr[2], 0, sizeof(vsi_nn_tensor_attr_t));
|
||||
|
||||
status = vsi_nn_vxGetTensorAttr(input0, &attr[0]);
|
||||
status |= vsi_nn_vxGetTensorAttr(input1, &attr[1]);
|
||||
status |= vsi_nn_vxGetTensorAttr(output, &attr[2]);
|
||||
if (status != VX_SUCCESS)
|
||||
{
|
||||
VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
|
||||
return status;
|
||||
}
|
||||
|
||||
src0Format = attr[0].dtype.vx_type;
|
||||
src1Format = attr[1].dtype.vx_type;
|
||||
for (i = 0; i < attr[1].dim_num; i++)
|
||||
{
|
||||
coord_size[i] = attr[1].size[i];
|
||||
}
|
||||
dstFormat = attr[2].dtype.vx_type;
|
||||
|
||||
if (src0Format == VSI_NN_TYPE_FLOAT16 && src1Format == VSI_NN_TYPE_FLOAT16 && dstFormat == VSI_NN_TYPE_FLOAT16)
|
||||
{
|
||||
shaderParam.globalWorkScale[0] = 12;
|
||||
shaderParam.globalWorkScale[1] = 1;
|
||||
}
|
||||
|
||||
shaderParam.globalWorkSize[0] =
|
||||
gcmALIGN((coord_size[0] + shaderParam.globalWorkScale[0] - 1) / shaderParam.globalWorkScale[0], 4);
|
||||
shaderParam.globalWorkSize[1] =
|
||||
(coord_size[1] + shaderParam.globalWorkScale[1] - 1) / shaderParam.globalWorkScale[1];
|
||||
{
|
||||
vx_uint32 uniGemm3x3_4x4[16] = {
|
||||
0x15151515, // TCfg
|
||||
0x00000000, // ASelt
|
||||
0x02100210, 0x05430543, // ABin
|
||||
0x15151515, // BSelt
|
||||
0x05430210, 0x05430210, // BBin
|
||||
0x00000400, // AccumType, ConstantType, and PostShift
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
|
||||
};
|
||||
|
||||
vxSetNodeUniform(nodObj, "uniGemm3x3_4x4", 1, uniGemm3x3_4x4);
|
||||
}
|
||||
status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS,
|
||||
&shaderParam, sizeof(vx_kernel_execution_parameters_t));
|
||||
|
||||
return VX_SUCCESS;
|
||||
}
|
||||
|
||||
static vx_param_description_t vxTransform_setupThresKernelParam[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}
|
||||
};
|
||||
|
||||
vx_status VX_CALLBACK vxTransform_setupThresInitializer
|
||||
(
|
||||
vx_node nodObj,
|
||||
const vx_reference *paramObj,
|
||||
vx_uint32 paraNum
|
||||
)
|
||||
{
|
||||
// Alignment with a power of two value.
|
||||
#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1)
|
||||
#define gcmMIN(x, y) (((x) <= (y)) ? (x) : (y))
|
||||
#define gcmMAX(x, y) (((x) >= (y)) ? (x) : (y))
|
||||
#define MAX_MULTIPLIER_NUM (65535)
|
||||
#define MAX_POST_SHIFT_BITS (31)
|
||||
vx_kernel_execution_parameters_t shaderParam = {
|
||||
2, // workdim
|
||||
{0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
|
||||
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
|
||||
{0, 0, 0}, // localWorkSize: local group size in thread
|
||||
{0, 0, 0}}; // globalWorkSize: image size in thread
|
||||
|
||||
vx_status status = VX_SUCCESS;
|
||||
vx_scalar thresFlag_s = (vx_scalar)paramObj[2];
|
||||
vx_enum src0Format = VSI_NN_TYPE_FLOAT16;
|
||||
vx_enum src1Format = VSI_NN_TYPE_FLOAT16;
|
||||
|
||||
vx_int32 thresFlag = 0;
|
||||
vx_uint32 extract_packed[4] = {0};
|
||||
|
||||
    status = vxCopyScalar(thresFlag_s, &thresFlag, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
|
||||
if(status < 0)
|
||||
VSILOGE("error-%s,%d\n",__FILE__,__LINE__);
|
||||
|
||||
shaderParam.globalWorkScale[0] = 1;
|
||||
shaderParam.globalWorkScale[1] = 1;
|
||||
shaderParam.localWorkSize[0] = 1;
|
||||
shaderParam.localWorkSize[1] = 1;
|
||||
shaderParam.globalWorkSize[0] = 1;
|
||||
shaderParam.globalWorkSize[1] = 1;
|
||||
|
||||
if (src0Format == src1Format && src0Format == VSI_NN_TYPE_FLOAT16)
|
||||
{
|
||||
vx_uint32 i = 0;
|
||||
vx_uint32 j = 0;
|
||||
for (i = 0; i < 4; i++)
|
||||
{
|
||||
if (thresFlag & (1 << i))
|
||||
{
|
||||
extract_packed[0] |= ((i << 4) << (i * 8));
|
||||
}
|
||||
else
|
||||
{
|
||||
extract_packed[0] |= (((j << 4) + 128) << (i * 8));
|
||||
j ++;
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 4; i < 6; i++)
|
||||
{
|
||||
if (thresFlag & (1 << i))
|
||||
{
|
||||
extract_packed[1] |= ((i << 4) << (i * 8 - 32));
|
||||
}
|
||||
else
|
||||
{
|
||||
extract_packed[1] |= (((j << 4) + 128) << (i * 8 - 32));
|
||||
j ++;
|
||||
}
|
||||
}
|
||||
|
||||
extract_packed[2] = extract_packed[3] = 0x10101010;
|
||||
}
|
||||
|
||||
vxSetNodeUniform(nodObj, "extract_packed", 1, extract_packed);
|
||||
|
||||
status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS,
|
||||
&shaderParam, sizeof(vx_kernel_execution_parameters_t));
|
||||
|
||||
return VX_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static vx_param_description_t vxTransform_InterPKernelParam[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}
|
||||
};
|
||||
|
||||
|
||||
vx_status VX_CALLBACK vxTransform_InterPInitializer
|
||||
(
|
||||
vx_node nodObj,
|
||||
const vx_reference *paramObj,
|
||||
vx_uint32 paraNum
|
||||
)
|
||||
{
|
||||
// Alignment with a power of two value.
|
||||
#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1)
|
||||
#define gcmMIN(x, y) (((x) <= (y)) ? (x) : (y))
|
||||
#define gcmMAX(x, y) (((x) >= (y)) ? (x) : (y))
|
||||
#define MAX_MULTIPLIER_NUM (65535)
|
||||
#define MAX_POST_SHIFT_BITS (31)
|
||||
vx_kernel_execution_parameters_t shaderParam = {
|
||||
2, // workdim
|
||||
{0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
|
||||
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
|
||||
{0, 0, 0}, // localWorkSize: local group size in thread
|
||||
{0, 0, 0}}; // globalWorkSize: image size in thread
|
||||
|
||||
vx_status status = VX_SUCCESS;
|
||||
vx_tensor input0 = (vx_tensor)paramObj[0];
|
||||
vx_tensor input1 = (vx_tensor)paramObj[1];
|
||||
vx_tensor output = (vx_tensor)paramObj[2];
|
||||
vx_enum src0Format = VSI_NN_TYPE_FLOAT16;
|
||||
vx_enum src1Format = VSI_NN_TYPE_FLOAT16;
|
||||
vx_enum dstFormat = VSI_NN_TYPE_FLOAT16;
|
||||
vx_uint32 coord_size[4] = {1, 1, 1, 1};
|
||||
vx_uint32 input_size[4] = {1, 1, 1, 1};
|
||||
vx_uint32 output_size[4] = {1, 1, 1, 1};
|
||||
vx_uint32 i = 0;
|
||||
vsi_nn_tensor_attr_t attr[3];
|
||||
|
||||
memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t));
|
||||
memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t));
|
||||
memset(&attr[2], 0, sizeof(vsi_nn_tensor_attr_t));
|
||||
|
||||
status = vsi_nn_vxGetTensorAttr(input0, &attr[0]);
|
||||
status |= vsi_nn_vxGetTensorAttr(input1, &attr[1]);
|
||||
status |= vsi_nn_vxGetTensorAttr(output, &attr[2]);
|
||||
if (status != VX_SUCCESS)
|
||||
{
|
||||
VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
|
||||
return status;
|
||||
}
|
||||
|
||||
for (i = 0; i < attr[0].dim_num; i++)
|
||||
{
|
||||
input_size[i] = attr[0].size[i];
|
||||
}
|
||||
src0Format = attr[0].dtype.vx_type;
|
||||
src1Format = attr[1].dtype.vx_type;
|
||||
for (i = 0; i < attr[1].dim_num; i++)
|
||||
{
|
||||
coord_size[i] = attr[1].size[i];
|
||||
}
|
||||
dstFormat = attr[2].dtype.vx_type;
|
||||
for (i = 0; i < attr[2].dim_num; i++)
|
||||
{
|
||||
output_size[i] = attr[2].size[i];
|
||||
}
|
||||
|
||||
if ((src0Format == VSI_NN_TYPE_FLOAT16 && src1Format == VSI_NN_TYPE_FLOAT16 && dstFormat == VSI_NN_TYPE_FLOAT16)
|
||||
|| (src0Format == VSI_NN_TYPE_INT16 && src1Format == VSI_NN_TYPE_INT16 && dstFormat == VSI_NN_TYPE_INT16))
|
||||
{
|
||||
shaderParam.globalWorkScale[0] = 2;
|
||||
shaderParam.globalWorkScale[1] = 1;
|
||||
}
|
||||
|
||||
shaderParam.globalWorkSize[0] =
|
||||
gcmALIGN((coord_size[0] + shaderParam.globalWorkScale[0] - 1) / shaderParam.globalWorkScale[0], 4);
|
||||
shaderParam.globalWorkSize[1] =
|
||||
(coord_size[1] + shaderParam.globalWorkScale[1] - 1) / shaderParam.globalWorkScale[1];
|
||||
{
|
||||
vx_int32 packedWH2[2] = {input_size[0], input_size[1]};
|
||||
vx_int32 packedWH = (input_size[1] << 16) | (input_size[0] & 0xFFFF);
|
||||
vx_uint32 uniGetDXY_4x4[16] = {
|
||||
0x05050505, // TCfg
|
||||
0x04040404, // ASelt
|
||||
0x00100001, 0x00010010, // ABin
|
||||
0x09090909, // BSelt
|
||||
0x00010000, 0x00000001, // BBin
|
||||
0x00000101, // AccumType, ConstantType, and PostShift
|
||||
0x3c000000, 0x00000000, 0x3c000000, 0x00000000,
|
||||
0x3c000000, 0x00000000, 0x3c000000, 0x00000000 // Constant
|
||||
};
|
||||
vx_uint32 uniConvertF16toF32_4x4[16] = {
|
||||
0x01010101, // TCfg
|
||||
0x01010000, // ASelt
|
||||
0x00010000, 0x00010000, // ABin
|
||||
0x02020202, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00000100, // AccumType, ConstantType, and PostShift
|
||||
0x00003c00, 0x00000000, 0x00003c00, 0x00000000,
|
||||
0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
|
||||
};
|
||||
|
||||
vxSetNodeUniform(nodObj, "uniGetDXY_4x4", 1, uniGetDXY_4x4);
|
||||
vxSetNodeUniform(nodObj, "uniConvertF16toF32_4x4", 1, uniConvertF16toF32_4x4);
|
||||
|
||||
//packedWH2[0] = input_size[0];
|
||||
//packedWH2[1] = input_size[1];
|
||||
//packedWH = (input_size[1] << 16) | (input_size[0] & 0xFFFF);
|
||||
vxSetNodeUniform(nodObj, "packedWH2", 1, packedWH2);
|
||||
vxSetNodeUniform(nodObj, "packedWH", 1, &packedWH);
|
||||
}
|
||||
if (output_size[2] > 1)
|
||||
{
|
||||
vxSetNodeUniform(nodObj, "depth", 1, &output_size[2]);
|
||||
}
|
||||
|
||||
status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS,
|
||||
&shaderParam, sizeof(vx_kernel_execution_parameters_t));
|
||||
|
||||
return VX_SUCCESS;
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
vx_kernel_description_t vxSpatial_transformer_CPU =
|
||||
{
|
||||
_VX_KERNEL_ID,
|
||||
_VX_KERNEL_NAME,
|
||||
_VX_KERNEL_FUNC_KERNEL,
|
||||
s_params,
|
||||
_cnt_of_array( s_params ),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vsi_nn_KernelInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxTransform_GemmKernelInfo_F16toF16 =
|
||||
{
|
||||
VX_KERNEL_ENUM_SPATIAL_TRANSFORMER,
|
||||
VX_KERNEL_NAME_SPATIAL_TRANSFORMER,
|
||||
NULL,
|
||||
vxTransform_GemmKernelParam,
|
||||
(sizeof(vxTransform_GemmKernelParam) / sizeof(vxTransform_GemmKernelParam[0])),
|
||||
vxValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxTransform_GemmInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxTransform_setupThresKernelInfo_F16toF16 =
|
||||
{
|
||||
VX_KERNEL_ENUM_SPATIAL_TRANSFORMER,
|
||||
VX_KERNEL_NAME_TRANSFORM_SETUP_THRES_F16TOF16,
|
||||
NULL,
|
||||
vxTransform_setupThresKernelParam,
|
||||
(sizeof(vxTransform_setupThresKernelParam) / sizeof(vxTransform_setupThresKernelParam[0])),
|
||||
vxValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxTransform_setupThresInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxTransform_InterPKernelInfo_F16toF16_2D =
|
||||
{
|
||||
VX_KERNEL_ENUM_SPATIAL_TRANSFORMER,
|
||||
VX_KERNEL_NAME_TRANSFORM_INTERP_F16TOF16_2D,
|
||||
NULL,
|
||||
vxTransform_InterPKernelParam,
|
||||
(sizeof(vxTransform_InterPKernelParam) / sizeof(vxTransform_InterPKernelParam[0])),
|
||||
vxValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxTransform_InterPInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxTransform_InterPKernelInfo_F16toF16 =
|
||||
{
|
||||
VX_KERNEL_ENUM_SPATIAL_TRANSFORMER,
|
||||
VX_KERNEL_NAME_TRANSFORM_INTERP_F16TOF16,
|
||||
NULL,
|
||||
vxTransform_InterPKernelParam,
|
||||
(sizeof(vxTransform_InterPKernelParam) / sizeof(vxTransform_InterPKernelParam[0])),
|
||||
vxValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxTransform_InterPInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t * vx_kernel_SPATIAL_TRANSFORMER_list[] =
|
||||
{
|
||||
&vxSpatial_transformer_CPU,
|
||||
&vxTransform_setupThresKernelInfo_F16toF16,
|
||||
&vxTransform_GemmKernelInfo_F16toF16,
|
||||
&vxTransform_InterPKernelInfo_F16toF16_2D,
|
||||
&vxTransform_InterPKernelInfo_F16toF16,
|
||||
NULL
|
||||
};
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
@ -1,124 +0,0 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
#include <stdlib.h>
|
||||
#include <VX/vx_khr_cnn.h>
|
||||
#include <VX/vx_helper.h>
|
||||
#include <VX/vx.h>
|
||||
#include <VX/vx_ext_program.h>
|
||||
|
||||
#include "vsi_nn_pub.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "libnnext/vsi_nn_vxkernel.h"
|
||||
#include "libnnext/vx_lib_nnext.h"
|
||||
|
||||
#define _VX_KERNEL_VAR_CPU (vx_client_kernel_cpu_SYNC_HOST)
|
||||
#define _VX_KERNEL_ID KERNEL_ENUM_SYNC_HOST
|
||||
#define _VX_KERNEL_NAME ("com.vivantecorp.extension.Sync_hostVXC")
|
||||
#define _VX_KERNEL_FUNC_KERNEL (vxSync_hostKernel)
|
||||
|
||||
static vsi_status VX_CALLBACK vxSync_hostKernel
|
||||
(
|
||||
vx_node node,
|
||||
const vx_reference* paramObj,
|
||||
uint32_t paramNum
|
||||
)
|
||||
{
|
||||
vsi_status status = 0;
|
||||
vx_context context = NULL;
|
||||
vx_tensor input = NULL;
|
||||
vx_tensor output = NULL;
|
||||
uint8_t * in_buffer = NULL;
|
||||
uint32_t in_stride[8] = { 0 };
|
||||
vx_tensor_addressing in_addr = NULL;
|
||||
vsi_nn_tensor_attr_t in_attr;
|
||||
|
||||
status = VX_SUCCESS;
|
||||
context = vxGetContext( (vx_reference)node );
|
||||
input = (vx_tensor)paramObj[0];
|
||||
output = (vx_tensor)paramObj[1];
|
||||
memset(&in_attr, 0x0, sizeof(vsi_nn_tensor_attr_t));
|
||||
|
||||
in_buffer = vsi_nn_ConvertRawTensorToData2( context, input,
|
||||
&in_attr, in_stride, &in_addr, VX_READ_ONLY );
|
||||
|
||||
status = vsi_nn_vxCopyDataToTensor(context, output, &in_attr, in_buffer);
|
||||
if (status != VX_SUCCESS)
|
||||
{
|
||||
VSILOGE("vsi_nn_vxCopyDataToTensor failure! at line %d\n", __LINE__);
|
||||
goto OnError;
|
||||
}
|
||||
|
||||
OnError:
|
||||
if( NULL != in_buffer )
|
||||
{
|
||||
free( in_buffer );
|
||||
}
|
||||
return status;
|
||||
} /* _VX_KERNEL_FUNC_KERNEL() */
|
||||
|
||||
static vx_param_description_t s_params[] =
|
||||
{
|
||||
{ VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
|
||||
{ VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
|
||||
};
|
||||
|
||||
vx_status VX_CALLBACK vxSync_hostInitializer
|
||||
(
|
||||
vx_node nodObj,
|
||||
const vx_reference *paramObj,
|
||||
vx_uint32 paraNum
|
||||
)
|
||||
{
|
||||
vx_status status = VX_SUCCESS;
|
||||
/*TODO: Add initial code for VX program*/
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
vx_kernel_description_t _VX_KERNEL_VAR_CPU =
|
||||
{
|
||||
_VX_KERNEL_ID,
|
||||
_VX_KERNEL_NAME,
|
||||
_VX_KERNEL_FUNC_KERNEL,
|
||||
s_params,
|
||||
_cnt_of_array( s_params ),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vsi_nn_KernelInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t * vx_kernel_SYNC_HOST_list[] =
|
||||
{
|
||||
&_VX_KERNEL_VAR_CPU,
|
||||
NULL
|
||||
};
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
@ -1,287 +0,0 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <math.h>
|
||||
|
||||
#include "vsi_nn_platform.h"
|
||||
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "utils/vsi_nn_dtype_util.h"
|
||||
#include "utils/vsi_nn_math.h"
|
||||
#include "libnnext/vsi_nn_vxkernel.h"
|
||||
#include "libnnext/vx_lib_nnext.h"
|
||||
|
||||
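/*
 * Copy one row of `width` int16 samples from dataIn into row `index` of
 * dataIO; the height/channel/batch arguments are accepted for interface
 * symmetry but are not used by this reference implementation.
 */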
void tensorStackConcatFunc
|
||||
(
|
||||
int16_t* dataIn,
|
||||
int16_t* dataIO,
|
||||
int32_t index,
|
||||
uint32_t width,
|
||||
uint32_t height,
|
||||
uint32_t channel,
|
||||
uint32_t batch
|
||||
)
|
||||
{
|
||||
int32_t stride = width * sizeof(int16_t);
|
||||
VSILOGI("Hello tensorStackConcatFunc!\n");
|
||||
memcpy(dataIO + index * width, dataIn, stride);
|
||||
return;
|
||||
}
|
||||
vsi_status VX_CALLBACK vxTensorStackConcatKernel
|
||||
(
|
||||
vx_node node,
|
||||
const vx_reference* paramObj,
|
||||
uint32_t paramNum
|
||||
)
|
||||
{
|
||||
vsi_status status = VX_ERROR_INVALID_PARAMETERS;
|
||||
|
||||
if(paramNum == 3)
|
||||
{
|
||||
vx_context context = NULL;
|
||||
// tensor
|
||||
vx_tensor imgObj[2] = { NULL };
|
||||
vsi_nn_tensor_attr_t attr[2];
|
||||
int16_t *input = NULL, *output = NULL;
|
||||
uint32_t input_size[4] = {1, 1, 1, 1}, output_size[4] = {1, 1, 1, 1};
|
||||
uint32_t input_stride_size[4] = {1, 1, 1, 1};
|
||||
uint32_t output_stride_size[4] = {1, 1, 1, 1};
|
||||
vx_tensor_addressing input_user_addr = NULL;
|
||||
vx_tensor_addressing output_user_addr = NULL;
|
||||
vsi_nn_type_e inputFormat = VSI_NN_TYPE_FLOAT16, outputFormat = VSI_NN_TYPE_FLOAT16;
|
||||
uint32_t input_dims = 0, output_dims = 0;
|
||||
uint32_t i;
|
||||
// scalar
|
||||
vx_scalar scalar[1] = { NULL };
|
||||
int32_t index = 0;
|
||||
|
||||
status = VX_SUCCESS;
|
||||
imgObj[0] = (vx_tensor)paramObj[0];
|
||||
imgObj[1] = (vx_tensor)paramObj[1];
|
||||
scalar[0] = (vx_scalar)paramObj[2];
|
||||
memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t));
|
||||
memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t));
|
||||
context = vxGetContext((vx_reference)node);
|
||||
if (context == NULL)
|
||||
{
|
||||
VSILOGE("vxGetContext failure! at line %d\n", __LINE__);
|
||||
return status;
|
||||
}
|
||||
|
||||
status = vsi_nn_vxGetTensorAttr(imgObj[0], &attr[0]);
|
||||
        status |= vsi_nn_vxGetTensorAttr(imgObj[1], &attr[1]);
|
||||
if (status != VX_SUCCESS)
|
||||
{
|
||||
VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
|
||||
goto final;
|
||||
}
|
||||
|
||||
//input
|
||||
input_dims = attr[0].dim_num;
|
||||
inputFormat = attr[0].dtype.vx_type;
|
||||
for (i = 0; i < input_dims; i++)
|
||||
{
|
||||
input_size[i] = attr[0].size[i];
|
||||
}
|
||||
//output
|
||||
output_dims = attr[1].dim_num;
|
||||
outputFormat = attr[1].dtype.vx_type;
|
||||
for (i = 0; i < output_dims; i++)
|
||||
{
|
||||
output_size[i] = attr[1].size[i];
|
||||
}
|
||||
|
||||
input_size[2] = (input_dims <= 2)?1:input_size[2];
|
||||
input_size[3] = (input_dims <= 3)?1:input_size[3];
|
||||
input_stride_size[0] = vsi_nn_GetTypeBytes(inputFormat);
|
||||
for (i=1; i< input_dims; i++)
|
||||
{
|
||||
input_stride_size[i] = input_stride_size[i-1] * input_size[i-1];
|
||||
}
|
||||
input = (int16_t*)malloc(input_size[0]*input_size[1]*input_size[2]*sizeof(int16_t));
|
||||
input_user_addr = vxCreateTensorAddressing(context, input_size, input_stride_size, (vx_uint8)input_dims);
|
||||
vsi_nn_copy_tensor_patch(imgObj[0], &attr[0], input, VX_READ_ONLY);
|
||||
output_stride_size[0] = vsi_nn_GetTypeBytes(outputFormat);
|
||||
for (i=1; i< output_dims; i++)
|
||||
{
|
||||
output_stride_size[i] = output_stride_size[i-1] * output_size[i-1];
|
||||
}
|
||||
output = (int16_t*)malloc(output_size[0]*output_size[1]*output_size[2]*sizeof(int16_t));
|
||||
output_user_addr = vxCreateTensorAddressing(context, output_size,
|
||||
output_stride_size, (vx_uint8)output_dims);
|
||||
|
||||
vsi_nn_copy_tensor_patch(imgObj[1], &attr[1], output, VX_READ_ONLY);
|
||||
// scalar
|
||||
status = vxCopyScalar(scalar[0], &index, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
if (status != VX_SUCCESS)
|
||||
{
|
||||
VSILOGE("vxCopyScalar failure! at line %d\n", __LINE__);
|
||||
goto final;
|
||||
}
|
||||
// Call C Prototype
|
||||
tensorStackConcatFunc(input, output, index, input_size[0],
|
||||
input_size[1], input_size[2], input_size[3]);
|
||||
//output tensor
|
||||
vsi_nn_copy_tensor_patch(imgObj[1], &attr[1], output, VX_WRITE_ONLY);
|
||||
final:
|
||||
if(input) free(input);
|
||||
if(output) free(output);
|
||||
if(input_user_addr) vxReleaseTensorAddressing(&input_user_addr);
|
||||
if(output_user_addr) vxReleaseTensorAddressing(&output_user_addr);
|
||||
}
|
||||
return status;
|
||||
}
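For reference, the CPU fallback above just copies one input row into row `index` of the stacked output; a minimal host-side sketch of that copy, assuming 16-bit elements (names and sizes here are illustrative, not part of the kernel sources):

/* Minimal sketch of the stack-concat row copy done by tensorStackConcatFunc,
 * assuming 16-bit elements; names and sizes are illustrative only. */
#include <stdint.h>
#include <string.h>

static void stack_concat_row(int16_t *stacked, const int16_t *row,
                             int32_t index, uint32_t width)
{
    /* Row `index` of the stacked output receives the whole input row. */
    memcpy(stacked + (size_t)index * width, row, width * sizeof(int16_t));
}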
|
||||
vsi_status VX_CALLBACK vxTensorStackConcatInitializer
|
||||
(
|
||||
vx_node nodObj,
|
||||
const vx_reference *paramObj,
|
||||
uint32_t paraNum
|
||||
)
|
||||
{
|
||||
vsi_status status = VX_SUCCESS;
|
||||
// Alignment with a power of two value.
|
||||
#define gcmALIGN(n, align) (((n) + ((align) - 1)) & ~((align) - 1))
|
||||
vx_kernel_execution_parameters_t shaderParam = {
|
||||
3, // workdim
|
||||
{0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
|
||||
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
|
||||
{0, 0, 0}, // localWorkSize: local group size in thread
|
||||
{0, 0, 0}}; // globalWorkSize: image size in thread
|
||||
|
||||
vx_tensor input = (vx_tensor)paramObj[0];
|
||||
uint32_t input_size[4] = {1, 1, 1, 1};
|
||||
uint32_t input_dims = 0;
|
||||
vsi_nn_type_e inputDataFormat = VSI_NN_TYPE_FLOAT16;
|
||||
vsi_nn_tensor_attr_t attr;
|
||||
uint32_t i;
|
||||
|
||||
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
|
||||
status = vsi_nn_vxGetTensorAttr(input, &attr);
|
||||
if (status != VX_SUCCESS)
|
||||
{
|
||||
VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
|
||||
return status;
|
||||
}
|
||||
|
||||
input_dims = attr.dim_num;
|
||||
inputDataFormat = attr.dtype.vx_type;
|
||||
for (i = 0; i < input_dims; i++)
|
||||
{
|
||||
input_size[i] = attr.size[i];
|
||||
}
|
||||
input_size[2] = (input_dims <= 2)?1:input_size[2];
|
||||
|
||||
shaderParam.globalWorkOffset[0] = 0;
|
||||
shaderParam.globalWorkOffset[1] = 0;
|
||||
shaderParam.globalWorkOffset[2] = 0;
|
||||
if (inputDataFormat == VSI_NN_TYPE_FLOAT16 || inputDataFormat == VSI_NN_TYPE_INT16)
|
||||
shaderParam.globalWorkScale[0] = 16;
|
||||
else
|
||||
shaderParam.globalWorkScale[0] = 32;
|
||||
shaderParam.globalWorkScale[1] = 1;
|
||||
shaderParam.globalWorkScale[2] = 1;
|
||||
shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1)
|
||||
/ shaderParam.globalWorkScale[0], 4);
|
||||
shaderParam.globalWorkSize[1] = (input_size[1] + shaderParam.globalWorkScale[1] - 1)
|
||||
/ shaderParam.globalWorkScale[1];
|
||||
shaderParam.globalWorkSize[2] = (input_size[2] + shaderParam.globalWorkScale[2] - 1)
|
||||
/ shaderParam.globalWorkScale[2];
|
||||
|
||||
status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS,
|
||||
&shaderParam, sizeof(vx_kernel_execution_parameters_t));
|
||||
if(status < 0)
|
||||
{
|
||||
VSILOGE("[%s : %d]Initializer failure! \n",__FILE__, __LINE__);
|
||||
}
|
||||
return status;
|
||||
}
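A hedged numeric walk-through of the dispatch math above, for an assumed FP16 input of 100x64x3 (globalWorkScale comes out as {16, 1, 1} for that type):

/* Same rounding as the initializer above, written out for one assumed shape.
 * gcmALIGN rounds up to a multiple of a power-of-two alignment. */
#define gcmALIGN(n, align) (((n) + ((align) - 1)) & ~((align) - 1))

/* FP16 input of 100 x 64 x 3, globalWorkScale = {16, 1, 1}:
 *   x: gcmALIGN((100 + 16 - 1) / 16, 4) = gcmALIGN(7, 4) = 8
 *   y: (64 + 1 - 1) / 1                 = 64
 *   z: (3  + 1 - 1) / 1                 = 3
 */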
|
||||
static vx_param_description_t vxTensorStackConcatKernelParam[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}
|
||||
};
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
vx_kernel_description_t vxTensorStackConcatKernelInfo =
|
||||
{
|
||||
VX_KERNEL_ENUM_TENSORSTACKCONCAT,
|
||||
VX_KERNEL_NAME_TENSORSTACKCONCAT,
|
||||
NULL,
|
||||
vxTensorStackConcatKernelParam,
|
||||
(sizeof(vxTensorStackConcatKernelParam) / sizeof(vxTensorStackConcatKernelParam[0])),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxTensorStackConcatInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxTensorStackConcatKernelInfo8Bits =
|
||||
{
|
||||
VX_KERNEL_ENUM_TENSORSTACKCONCAT8BITS,
|
||||
VX_KERNEL_NAME_TENSORSTACKCONCAT8BITS,
|
||||
NULL,
|
||||
vxTensorStackConcatKernelParam,
|
||||
(sizeof(vxTensorStackConcatKernelParam) / sizeof(vxTensorStackConcatKernelParam[0])),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxTensorStackConcatInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxTensorStackConcatKernelInfo_CPU =
|
||||
{
|
||||
VX_KERNEL_ENUM_TENSORSTACKCONCAT,
|
||||
VX_KERNEL_NAME_TENSORSTACKCONCAT,
|
||||
vxTensorStackConcatKernel,
|
||||
vxTensorStackConcatKernelParam,
|
||||
(sizeof(vxTensorStackConcatKernelParam) / sizeof(vxTensorStackConcatKernelParam[0])),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vsi_nn_KernelInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t * vx_kernel_TENSORSTACKCONCAT_list[] =
|
||||
{
|
||||
&vxTensorStackConcatKernelInfo_CPU,
|
||||
&vxTensorStackConcatKernelInfo,
|
||||
&vxTensorStackConcatKernelInfo8Bits,
|
||||
NULL
|
||||
};
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
@ -30,24 +30,27 @@ __kernel void batch_norm_##name0##_F16_F16_F32_F32to##name1##_brdcst1( \
|
|||
_viv_asm(COPY, mean, _mean, 16); \
|
||||
VXC_ReadImage2DArray(_var, Variance, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
_viv_asm(COPY, var, _var, 16); \
|
||||
float4 gamma0 = read_imagef(Gamma, coord); \
|
||||
coord.x += 4; \
|
||||
float4 gamma1 = read_imagef(Gamma, coord); \
|
||||
coord.x -= 4; \
|
||||
float4 beta = read_imagef(Beta, coord); \
|
||||
int4 coord_in = coord; \
|
||||
int depth = get_image_array_size(Gamma); \
|
||||
_viv_asm(CLAMP0MAX, coord_in.z, coord_in.z, depth - 1); \
|
||||
float4 gamma = read_imagef(Gamma, coord_in); \
|
||||
coord_in.z = coord.z; \
|
||||
depth = get_image_array_size(Beta); \
|
||||
_viv_asm(CLAMP0MAX, coord_in.z, coord_in.z, depth - 1); \
|
||||
float4 beta = read_imagef(Beta, coord_in); \
|
||||
\
|
||||
float4 src0, src1, m, v; \
|
||||
VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \
|
||||
VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \
|
||||
VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \
|
||||
gamma0 = gamma0 * rsqrt(v + eps); \
|
||||
float4 gamma0 = gamma.xxxx * rsqrt(v + eps); \
|
||||
src0 = src0 * input_scale + input_tail; \
|
||||
src0 = (src0 - m) * gamma0 + beta.xxxx; \
|
||||
src0 = src0 * output_scale + output_zp; \
|
||||
VXC_DP4x4(src1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \
|
||||
VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \
|
||||
VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \
|
||||
gamma1 = gamma1 * rsqrt(v + eps); \
|
||||
float4 gamma1 = gamma.xxxx * rsqrt(v + eps); \
|
||||
src1 = src1 * input_scale + input_tail; \
|
||||
src1 = (src1 - m) * gamma1 + beta.xxxx; \
|
||||
src1 = src1 * output_scale + output_zp; \
|
||||
|
|
@ -95,22 +98,21 @@ __kernel void batch_norm_##name0##_F16_F16_F32_F32to##name1##_brdcst1_2D( \
|
|||
_viv_asm(COPY, mean, _mean, 16); \
|
||||
VXC_ReadImage(_var, Variance, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
_viv_asm(COPY, var, _var, 16); \
|
||||
float4 gamma0 = read_imagef(Gamma, coord.xy); \
|
||||
float4 gamma1 = read_imagef(Gamma, coord.zy); \
|
||||
float4 gamma = read_imagef(Gamma, coord.xy); \
|
||||
float4 beta = read_imagef(Beta, coord.xy); \
|
||||
\
|
||||
float4 src0, src1, m, v; \
|
||||
VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \
|
||||
VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \
|
||||
VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \
|
||||
gamma0 = gamma0 * rsqrt(v + eps); \
|
||||
float4 gamma0 = gamma.xxxx * rsqrt(v + eps); \
|
||||
src0 = src0 * input_scale + input_tail; \
|
||||
src0 = (src0 - m) * gamma0 + beta.xxxx; \
|
||||
src0 = src0 * output_scale + output_zp; \
|
||||
VXC_DP4x4(src1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \
|
||||
VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \
|
||||
VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \
|
||||
gamma1 = gamma1 * rsqrt(v + eps); \
|
||||
float4 gamma1 = gamma.xxxx * rsqrt(v + eps); \
|
||||
src1 = src1 * input_scale + input_tail; \
|
||||
src1 = (src1 - m) * gamma1 + beta.xxxx; \
|
||||
src1 = src1 * output_scale + output_zp; \
|
||||
|
|
@ -158,12 +160,18 @@ __kernel void batch_norm_##name0##_F16_F16_F32_F32to##name1##_brdcst0( \
|
|||
_viv_asm(COPY, mean, _mean, 16); \
|
||||
VXC_ReadImage2DArray(_var, Variance, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
_viv_asm(COPY, var, _var, 16); \
|
||||
float4 gamma0 = read_imagef(Gamma, coord); \
|
||||
float4 beta0 = read_imagef(Beta, coord); \
|
||||
coord.x += 4; \
|
||||
float4 gamma1 = read_imagef(Gamma, coord); \
|
||||
float4 beta1 = read_imagef(Beta, coord); \
|
||||
coord.x -= 4; \
|
||||
int4 coord_in0 = coord; \
|
||||
int depth = get_image_array_size(Gamma); \
|
||||
_viv_asm(CLAMP0MAX, coord_in0.z, coord_in0.z, depth - 1); \
|
||||
float4 gamma0 = read_imagef(Gamma, coord_in0); \
|
||||
int4 coord_in1 = coord; \
|
||||
depth = get_image_array_size(Beta); \
|
||||
_viv_asm(CLAMP0MAX, coord_in1.z, coord_in1.z, depth - 1); \
|
||||
float4 beta0 = read_imagef(Beta, coord_in1); \
|
||||
coord_in0.x += 4; \
|
||||
coord_in1.x += 4; \
|
||||
float4 gamma1 = read_imagef(Gamma, coord_in0); \
|
||||
float4 beta1 = read_imagef(Beta, coord_in1); \
|
||||
\
|
||||
float4 src0, src1, m, v; \
|
||||
VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \
|
||||
|
|
@ -264,4 +272,3 @@ BATCH_NORM_SH_IMPL_AXIS1_2D(U8, U8, vxc_uchar16, vxc_uchar16, int4, vxc_uchar
|
|||
BATCH_NORM_SH_IMPL_AXIS1_2D(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_half8, vxc_ushort8)
|
||||
BATCH_NORM_SH_IMPL_AXIS1_2D(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, vxc_char16)
|
||||
BATCH_NORM_SH_IMPL_AXIS1_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8)
|
||||
|
||||
|
|
|
|||
|
|
@ -83,14 +83,6 @@ __kernel void conv1d_U8U8I32toU8_K1024_SMALL(
|
|||
VXC_WriteImage(output, coord.wy, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
}
|
||||
|
||||
inline uchar* get_image2D_array_ptr(image2d_array_t input)
|
||||
{
|
||||
int8 desc;
|
||||
_viv_asm(COPY, desc, input, sizeof(desc));
|
||||
uchar *src_ptr = (uchar*)desc.s0;
|
||||
return src_ptr;
|
||||
}
|
||||
|
||||
__kernel void conv1d_U8U8I32toU8_K1024_LARGE(
|
||||
__read_only image2d_array_t input,
|
||||
__read_only image2d_array_t weight,
|
||||
|
|
@ -112,9 +104,11 @@ __kernel void conv1d_U8U8I32toU8_K1024_LARGE(
|
|||
vxc_short8 w_zp = (short)weight_ZP;
|
||||
vxc_uchar16 input_val = 0, weight_val = 0;
|
||||
int temp = 0, i, j;
|
||||
uchar *src_ptr_base = (uchar *)get_image2D_array_ptr(input);
|
||||
Tensor src_tensor = create_image_from_image2d(input, 1);
|
||||
uchar *src_ptr_base = (uchar *)src_tensor.ptr;
|
||||
uchar *src_ptr;
|
||||
uchar *dst_ptr = (uchar *)get_image2D_array_ptr(output);
|
||||
Tensor dst_tensor = create_image_from_image2d(output, 1);
|
||||
uchar *dst_ptr = (uchar *)dst_tensor.ptr;
|
||||
|
||||
temp = read_imagei(bias, coord.yz).x;
|
||||
sum0 = convert_float(temp);
|
||||
|
|
@ -122,7 +116,7 @@ __kernel void conv1d_U8U8I32toU8_K1024_LARGE(
|
|||
|
||||
for (i = 0; i < input_height; i++)
|
||||
{
|
||||
src_ptr = src_ptr_base + (coord.x + coord.z * input_width);
|
||||
src_ptr = src_ptr_base + (coord.x + coord.z * src_tensor.stride_y);
|
||||
for (j = 0; j < kernel_cnt_x16; j++)
|
||||
{
|
||||
VXC_ReadImage2DArray(weight_val, weight, coord_w, VXC_5BITOFFSET_XY(0, 0), \
|
||||
|
|
@ -161,7 +155,7 @@ __kernel void conv1d_U8U8I32toU8_K1024_LARGE(
|
|||
_viv_asm(CONV_SAT_RTE, result1, sum1);
|
||||
vxc_uchar8 result;
|
||||
VXC_DP2x8(result, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumOrderUchar_2x8);
|
||||
dst_ptr = dst_ptr + (coord.w + coord.y * output_width);
|
||||
dst_ptr = dst_ptr + (coord.w + coord.y * dst_tensor.stride_y);
|
||||
VXC_Vstore8(dst_ptr, 0, result);
|
||||
}
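The switch from the local descriptor helper to create_image_from_image2d means the row stride comes from the image helper rather than the logical width, so padded rows are addressed correctly. A small host-side sketch of the addressing pattern used above; the struct layout is an assumption, and only the ptr/stride_y fields mirror the kernel code:

/* Hypothetical mirror of the addressing above: for a 1-byte-per-element
 * image, element (x, row) lives at ptr + x + row * stride_y. */
typedef struct
{
    unsigned char *ptr;      /* base address of the image data       */
    int            stride_y; /* bytes between the starts of two rows */
} image_view_t;

static unsigned char *image_elem_u8(image_view_t img, int x, int row)
{
    return img.ptr + x + row * img.stride_y;
}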
|
||||
|
||||
|
|
|
|||
|
|
@ -72,6 +72,56 @@ float4 eltwise_unary_round(float4 x)
|
|||
return convert_float4(convert_int4_rte(x));
|
||||
}

#define MUL2_RSQRTPI (1.1283791670955126f)
float erf_eval(float x)
{
    float res = 0;
    float tmp = x;
    float factorial = 1;
    float x_pow = x;
    float one = 1.0f;
    float n = 1;

    if (x <= -3)
        return -1;
    else if (x >= 3)
        return 1;

    while (fabs(tmp) > 1e-5)
    {
        res += tmp;

        factorial *= n;
        one *= -1;
        x_pow *= x * x;
        tmp = one / factorial * x_pow / (2 * n + 1);

        n += 1.0f;
    }
    return res * MUL2_RSQRTPI;
}
#define RSQRT2 (0.70710678118654752440084436210485f)
float4 eltwise_unary_gelu(float4 x)
{
    float4 erf, data;
    data = x * RSQRT2;
    erf.x = erf_eval(data.x);
    erf.y = erf_eval(data.y);
    erf.z = erf_eval(data.z);
    erf.w = erf_eval(data.w);
    x = 0.5f * x * (1 + erf);

    return x;
}

#define SQRT_2_RCP_PI 0.7978845834732056f
float4 eltwise_unary_hard_gelu(float4 x)
{
    float4 cdf = 0.5f + 0.5f * _tanh(SQRT_2_RCP_PI *
                 (x + 0.044715f * x * x * x));
    return x * cdf;
}
|
||||
|
||||
_viv_uniform float inputScale;
|
||||
_viv_uniform float inputTail;
|
||||
_viv_uniform float outputScale;
|
||||
|
|
@ -203,6 +253,28 @@ ELTSISE_UNARY_2D(round, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc
|
|||
ELTSISE_UNARY_2D(round, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
|
||||
ELTSISE_UNARY_2D(round, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
|
||||
ELTSISE_UNARY_2D(round, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
|
||||
//GELU
|
||||
ELTSISE_UNARY_2D(gelu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
|
||||
ELTSISE_UNARY_2D(gelu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
|
||||
ELTSISE_UNARY_2D(gelu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
|
||||
ELTSISE_UNARY_2D(gelu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
|
||||
ELTSISE_UNARY_2D(gelu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
|
||||
ELTSISE_UNARY_2D(gelu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
|
||||
ELTSISE_UNARY_2D(gelu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
|
||||
ELTSISE_UNARY_2D(gelu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
|
||||
ELTSISE_UNARY_2D(gelu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
|
||||
ELTSISE_UNARY_2D(gelu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
|
||||
//HARD_GELU
|
||||
ELTSISE_UNARY_2D(hard_gelu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
|
||||
ELTSISE_UNARY_2D(hard_gelu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
|
||||
ELTSISE_UNARY_2D(hard_gelu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
|
||||
ELTSISE_UNARY_2D(hard_gelu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
|
||||
ELTSISE_UNARY_2D(hard_gelu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
|
||||
ELTSISE_UNARY_2D(hard_gelu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
|
||||
ELTSISE_UNARY_2D(hard_gelu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
|
||||
ELTSISE_UNARY_2D(hard_gelu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
|
||||
ELTSISE_UNARY_2D(hard_gelu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
|
||||
ELTSISE_UNARY_2D(hard_gelu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
|
||||
|
||||
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
|
||||
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
|
||||
|
|
@ -252,3 +324,7 @@ ELTSISE_UNARY_BF16_2D(mish)
|
|||
ELTSISE_UNARY_BF16_2D(hard_sigmoid)
|
||||
//ROUND
|
||||
ELTSISE_UNARY_BF16_2D(round)
|
||||
//GELU
|
||||
ELTSISE_UNARY_BF16_2D(gelu)
|
||||
//HARD_GELU
|
||||
ELTSISE_UNARY_BF16_2D(hard_gelu)
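The 2D and 3D kernel files share the same series-based erf; the hard_gelu variant instead uses the tanh approximation 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3))). Below is a standalone host check of the series against libm, useful when validating the GELU shaders; it is an illustrative sketch, not part of the kernel sources (build with -lm):

/* Host-side check of the series used by eltwise_unary_gelu above:
 * erf(x) ~ 2/sqrt(pi) * sum_n (-1)^n x^(2n+1) / (n! * (2n+1)),
 * clamped to +/-1 outside [-3, 3] exactly as the kernel does. */
#include <math.h>
#include <stdio.h>

static float erf_series(float x)
{
    float res = 0.0f, tmp = x, factorial = 1.0f, x_pow = x, sign = 1.0f, n = 1.0f;
    if (x <= -3.0f) return -1.0f;
    if (x >=  3.0f) return  1.0f;
    while (fabsf(tmp) > 1e-5f)
    {
        res += tmp;
        factorial *= n;
        sign *= -1.0f;
        x_pow *= x * x;
        tmp = sign / factorial * x_pow / (2.0f * n + 1.0f);
        n += 1.0f;
    }
    return res * 1.1283791670955126f;   /* 2 / sqrt(pi) */
}

int main(void)
{
    for (float x = -2.0f; x <= 2.0f; x += 0.5f)
    {
        float gelu = 0.5f * x * (1.0f + erf_series(x * 0.70710678f));
        float ref  = 0.5f * x * (1.0f + erff(x * 0.70710678f));
        printf("x=% .2f  gelu_series=% .6f  gelu_libm=% .6f\n", x, gelu, ref);
    }
    return 0;
}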
|
||||
|
|
|
|||
|
|
@ -72,6 +72,56 @@ float4 eltwise_unary_round(float4 x)
|
|||
return convert_float4(convert_int4_rte(x));
|
||||
}
|
||||
|
||||
#define MUL2_RSQRTPI (1.1283791670955126f)
|
||||
float erf_eval(float x)
|
||||
{
|
||||
float res = 0;
|
||||
float tmp = x;
|
||||
float factorial = 1;
|
||||
float x_pow = x;
|
||||
float one = 1.0f;
|
||||
float n = 1;
|
||||
|
||||
if (x <= -3)
|
||||
return -1;
|
||||
else if(x >= 3)
|
||||
return 1;
|
||||
|
||||
while (fabs(tmp) > 1e-5)
|
||||
{
|
||||
res += tmp;
|
||||
|
||||
factorial *= n;
|
||||
one *= -1;
|
||||
x_pow *= x * x;
|
||||
tmp = one / factorial * x_pow / ( 2 * n + 1);
|
||||
|
||||
n += 1.0f;
|
||||
}
|
||||
return res * MUL2_RSQRTPI;
|
||||
}
|
||||
#define RSQRT2 (0.70710678118654752440084436210485f)
|
||||
float4 eltwise_unary_gelu(float4 x)
|
||||
{
|
||||
float4 erf, data;
|
||||
data = x * RSQRT2;
|
||||
erf.x = erf_eval(data.x);
|
||||
erf.y = erf_eval(data.y);
|
||||
erf.z = erf_eval(data.z);
|
||||
erf.w = erf_eval(data.w);
|
||||
x = 0.5f * x * (1 + erf);
|
||||
|
||||
return x;
|
||||
}
|
||||
|
||||
#define SQRT_2_RCP_PI 0.7978845834732056f
|
||||
float4 eltwise_unary_hard_gelu(float4 x)
|
||||
{
|
||||
float4 cdf = 0.5f + 0.5f * _tanh(SQRT_2_RCP_PI *
|
||||
(x + 0.044715f * x * x * x));
|
||||
return x * cdf;
|
||||
}
|
||||
|
||||
_viv_uniform float inputScale;
|
||||
_viv_uniform float inputTail;
|
||||
_viv_uniform float outputScale;
|
||||
|
|
@ -203,6 +253,28 @@ ELTSISE_UNARY_3D(round, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc
|
|||
ELTSISE_UNARY_3D(round, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
|
||||
ELTSISE_UNARY_3D(round, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
|
||||
ELTSISE_UNARY_3D(round, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
|
||||
//GELU
|
||||
ELTSISE_UNARY_3D(gelu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
|
||||
ELTSISE_UNARY_3D(gelu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
|
||||
ELTSISE_UNARY_3D(gelu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
|
||||
ELTSISE_UNARY_3D(gelu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
|
||||
ELTSISE_UNARY_3D(gelu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
|
||||
ELTSISE_UNARY_3D(gelu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
|
||||
ELTSISE_UNARY_3D(gelu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
|
||||
ELTSISE_UNARY_3D(gelu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
|
||||
ELTSISE_UNARY_3D(gelu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
|
||||
ELTSISE_UNARY_3D(gelu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
|
||||
//HARD_GELU
|
||||
ELTSISE_UNARY_3D(hard_gelu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
|
||||
ELTSISE_UNARY_3D(hard_gelu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
|
||||
ELTSISE_UNARY_3D(hard_gelu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
|
||||
ELTSISE_UNARY_3D(hard_gelu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
|
||||
ELTSISE_UNARY_3D(hard_gelu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
|
||||
ELTSISE_UNARY_3D(hard_gelu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
|
||||
ELTSISE_UNARY_3D(hard_gelu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
|
||||
ELTSISE_UNARY_3D(hard_gelu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
|
||||
ELTSISE_UNARY_3D(hard_gelu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
|
||||
ELTSISE_UNARY_3D(hard_gelu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
|
||||
|
||||
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
|
||||
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
|
||||
|
|
@ -250,4 +322,8 @@ ELTSISE_UNARY_BF16(mish)
|
|||
//HARD_SIGMOID
|
||||
ELTSISE_UNARY_BF16(hard_sigmoid)
|
||||
//ROUND
|
||||
ELTSISE_UNARY_BF16(round)
|
||||
//GELU
|
||||
ELTSISE_UNARY_BF16(gelu)
|
||||
//HARD_GELU
|
||||
ELTSISE_UNARY_BF16(hard_gelu)
|
||||
|
|
@ -1,8 +1,9 @@
|
|||
#include "cl_viv_vx_ext.h"
|
||||
|
||||
#define MUL2_RSQRTPI (1.1283791670955126f)
|
||||
float eltwise_unary_erf(float x)
|
||||
float eltwise_unary_erf(float _x)
|
||||
{
|
||||
float x = clamp(_x, -2.0f, 2.0f);
|
||||
float res = 0;
|
||||
float tmp = x;
|
||||
float factorial = 1;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,185 @@
|
|||
#include "cl_viv_vx_ext.h"
|
||||
|
||||
_viv_uniform float4 theta_1;
|
||||
_viv_uniform float4 theta_2;
|
||||
_viv_uniform float4 scale;
|
||||
_viv_uniform float input_scale;
|
||||
_viv_uniform float input_tail;
|
||||
|
||||
#define GET_MATRIX_SH_IMPL(name0, in_type, read_func) \
|
||||
__kernel void get_matrix_##name0##toF32 \
|
||||
( \
|
||||
__read_only image2d_t input, \
|
||||
__write_only image2d_t output, \
|
||||
int has_theta_1_1, \
|
||||
int has_theta_1_2, \
|
||||
int has_theta_1_3, \
|
||||
int has_theta_2_1, \
|
||||
int has_theta_2_2, \
|
||||
int has_theta_2_3, \
|
||||
float theta_1_1, \
|
||||
float theta_1_2, \
|
||||
float theta_1_3, \
|
||||
float theta_2_1, \
|
||||
float theta_2_2, \
|
||||
float theta_2_3, \
|
||||
float i_width, \
|
||||
float i_height, \
|
||||
float o_width, \
|
||||
float o_height \
|
||||
) \
|
||||
{ \
|
||||
int2 coord = (int2)(0, get_global_id(1)); \
|
||||
float4 matrix0, matrix1; \
|
||||
float4 theta1, theta2; \
|
||||
_viv_asm(COPY, theta1, theta_1, 16); \
|
||||
_viv_asm(COPY, theta2, theta_2, 16); \
|
||||
\
|
||||
if (has_theta_1_1 == 0) \
|
||||
{ \
|
||||
in_type data = read_func(input, coord); \
|
||||
coord.x ++; \
|
||||
theta1.x = convert_float(data.x) * input_scale + input_tail; \
|
||||
} \
|
||||
\
|
||||
if (has_theta_1_2 == 0) \
|
||||
{ \
|
||||
in_type data = read_func(input, coord); \
|
||||
coord.x ++; \
|
||||
theta1.y = convert_float(data.x) * input_scale + input_tail; \
|
||||
} \
|
||||
\
|
||||
if (has_theta_1_3 == 0) \
|
||||
{ \
|
||||
in_type data = read_func(input, coord); \
|
||||
coord.x ++; \
|
||||
theta1.z = convert_float(data.x) * input_scale + input_tail; \
|
||||
} \
|
||||
\
|
||||
if (has_theta_2_1 == 0) \
|
||||
{ \
|
||||
in_type data = read_func(input, coord); \
|
||||
coord.x ++; \
|
||||
theta2.x = convert_float(data.x) * input_scale + input_tail; \
|
||||
} \
|
||||
\
|
||||
if (has_theta_2_2 == 0) \
|
||||
{ \
|
||||
in_type data = read_func(input, coord); \
|
||||
coord.x ++; \
|
||||
theta2.y = convert_float(data.x) * input_scale + input_tail; \
|
||||
} \
|
||||
\
|
||||
if (has_theta_2_3 == 0) \
|
||||
{ \
|
||||
in_type data = read_func(input, coord); \
|
||||
coord.x ++; \
|
||||
theta2.z = convert_float(data.x) * input_scale + input_tail; \
|
||||
} \
|
||||
\
|
||||
matrix0.x = theta2.y * scale.x; \
|
||||
matrix0.z = theta2.x * scale.z; \
|
||||
matrix1.x = ( theta2.z - theta2.y - theta2.x + 1) * i_width * 0.5f; \
|
||||
matrix0.y = theta1.y * scale.w; \
|
||||
matrix0.w = theta1.x * scale.y; \
|
||||
matrix1.y = ( theta1.z - theta1.y - theta1.x + 1) * i_height * 0.5f; \
|
||||
matrix1.zw = 2.0f * matrix0.xy; \
|
||||
\
|
||||
coord.x = 0; \
|
||||
vxc_ushort8 dst; \
|
||||
_viv_asm(COPY, dst, matrix0, 16); \
|
||||
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
_viv_asm(COPY, dst, matrix1, 16); \
|
||||
coord.x = 8; \
|
||||
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
}
|
||||
GET_MATRIX_SH_IMPL(I16, int4, read_imagei)
|
||||
GET_MATRIX_SH_IMPL(I8, int4, read_imagei)
|
||||
GET_MATRIX_SH_IMPL(U8, uint4, read_imageui)
|
||||
|
||||
__kernel void get_matrix_F16toF32
|
||||
(
|
||||
__read_only image2d_t input,
|
||||
__write_only image2d_t output,
|
||||
int has_theta_1_1,
|
||||
int has_theta_1_2,
|
||||
int has_theta_1_3,
|
||||
int has_theta_2_1,
|
||||
int has_theta_2_2,
|
||||
int has_theta_2_3,
|
||||
float theta_1_1,
|
||||
float theta_1_2,
|
||||
float theta_1_3,
|
||||
float theta_2_1,
|
||||
float theta_2_2,
|
||||
float theta_2_3,
|
||||
float i_width,
|
||||
float i_height,
|
||||
float o_width,
|
||||
float o_height
|
||||
)
|
||||
{
|
||||
int2 coord = (int2)(0, get_global_id(1));
|
||||
float4 matrix0, matrix1;
|
||||
float4 theta1, theta2;
|
||||
_viv_asm(COPY, theta1, theta_1, 16);
|
||||
_viv_asm(COPY, theta2, theta_2, 16);
|
||||
|
||||
if (has_theta_1_1 == 0)
|
||||
{
|
||||
float4 data = read_imagef(input, coord);
|
||||
coord.x ++;
|
||||
theta1.x = data.x;
|
||||
}
|
||||
|
||||
if (has_theta_1_2 == 0)
|
||||
{
|
||||
float4 data = read_imagef(input, coord);
|
||||
coord.x ++;
|
||||
theta1.y = data.x;
|
||||
}
|
||||
|
||||
if (has_theta_1_3 == 0)
|
||||
{
|
||||
float4 data = read_imagef(input, coord);
|
||||
coord.x ++;
|
||||
theta1.z = data.x;
|
||||
}
|
||||
|
||||
if (has_theta_2_1 == 0)
|
||||
{
|
||||
float4 data = read_imagef(input, coord);
|
||||
coord.x ++;
|
||||
theta2.x = data.x;
|
||||
}
|
||||
|
||||
if (has_theta_2_2 == 0)
|
||||
{
|
||||
float4 data = read_imagef(input, coord);
|
||||
coord.x ++;
|
||||
theta2.y = data.x;
|
||||
}
|
||||
|
||||
if (has_theta_2_3 == 0)
|
||||
{
|
||||
float4 data = read_imagef(input, coord);
|
||||
coord.x ++;
|
||||
theta2.z = data.x;
|
||||
}
|
||||
|
||||
matrix0.x = theta2.y * scale.x;
|
||||
matrix0.z = theta2.x * scale.z;
|
||||
matrix1.x = ( theta2.z - theta2.y - theta2.x + 1) * i_width * 0.5f;
|
||||
matrix0.y = theta1.y * scale.w;
|
||||
matrix0.w = theta1.x * scale.y;
|
||||
matrix1.y = ( theta1.z - theta1.y - theta1.x + 1) * i_height * 0.5f;
|
||||
matrix1.zw = 2.0f * matrix0.xy;
|
||||
|
||||
coord.x = 0;
|
||||
vxc_ushort8 dst;
|
||||
_viv_asm(COPY, dst, matrix0, 16);
|
||||
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
_viv_asm(COPY, dst, matrix1, 16);
|
||||
coord.x = 8;
|
||||
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
}
|
||||
|
|
@ -16,7 +16,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean
|
|||
int gidx = get_global_id(0) << 3;
|
||||
int lidx = get_local_id(0);
|
||||
int gidz = get_global_id(1);
|
||||
int4 coord = (int4)(gidx, 0, gidz, 0);
|
||||
int4 coord = (int4)(gidx, 0, gidz, gidz);
|
||||
vxc_short8 src0;
|
||||
vxc_half8 in_h;
|
||||
vxc_float4 sumsqr;
|
||||
|
|
@ -133,7 +133,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16t
|
|||
image2d_array_t output, float eps, int rsFlg)
|
||||
{
|
||||
int gidz = get_global_id(1);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
int4 coord_para = (int4)(gidz, 0, 0, 0);
|
||||
vxc_short8 src0;
|
||||
vxc_short8 src1;
|
||||
|
|
@ -166,18 +167,20 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16t
|
|||
int8 input_desc, output_desc;
|
||||
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
|
||||
int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;
|
||||
_viv_asm(MOV, coord.z, baseAddr_a);
|
||||
_viv_asm(MOV, coord_in.z, baseAddr_a);
|
||||
|
||||
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
|
||||
int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;
|
||||
_viv_asm(MOV, coord.w, baseAddr);
|
||||
_viv_asm(MOV, coord.z, baseAddr);
|
||||
|
||||
for(coord.y = 0; coord.y < height; coord.y++)
|
||||
{
|
||||
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
_viv_asm(COPY, in_h, src0, 16);
|
||||
|
||||
coord_in.y ++;
|
||||
|
||||
VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
|
||||
UniFP16toFP32Lo4_dp4x4);
|
||||
VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
|
||||
|
|
@ -191,7 +194,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16t
|
|||
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
|
||||
uniConvertHalfToFp16_2x8);
|
||||
_viv_asm(COPY, outval, dst, 16);
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, outval, \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord, outval, \
|
||||
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -26,7 +26,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean
|
|||
int gidx = get_global_id(0) << 3;
|
||||
int lidx = get_local_id(0);
|
||||
int gidz = get_global_id(1);
|
||||
int4 coord = (int4)(gidx, 0, gidz, 0);
|
||||
int4 coord = (int4)(gidx, 0, gidz, gidz);
|
||||
vxc_short8 src0;
|
||||
float sum = 0, sqr = 0;
|
||||
vxc_float4 sumsqr = (vxc_float4)(0);
|
||||
|
|
@ -43,7 +43,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean
|
|||
{
|
||||
for(coord.y = 0; coord.y < height;)
|
||||
{
|
||||
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_OP4(img_load_3d, src0, input, coord, 0, \
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
coord.y++;
|
||||
VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\
|
||||
|
|
@ -106,7 +106,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean
|
|||
{
|
||||
for(; coord.y < endH;)
|
||||
{
|
||||
VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),
|
||||
VXC_ReadImage(src0, input, coord, 0,
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
coord.y++;
|
||||
VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\
|
||||
|
|
@ -154,7 +154,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t
|
|||
int rsFlg)
|
||||
{
|
||||
int gidz = get_global_id(1);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
int4 coord_para = (int4)(gidz, 0, 0, 0);
|
||||
vxc_short8 src0;
|
||||
vxc_short8 src1;
|
||||
|
|
@ -162,7 +163,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t
|
|||
float scale_vari, bias_val;
|
||||
vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);
|
||||
|
||||
VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\
|
||||
VXC_ReadImage(src1, scale, coord_para.xy, 0,\
|
||||
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
|
||||
_viv_asm(COPY, scale_h, src1, 16);
|
||||
VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
|
||||
|
|
@ -190,16 +191,17 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t
|
|||
int8 input_desc, output_desc;
|
||||
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
|
||||
int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;
|
||||
_viv_asm(MOV, coord.z, baseAddr_a);
|
||||
_viv_asm(MOV, coord_in.z, baseAddr_a);
|
||||
|
||||
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
|
||||
int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;
|
||||
_viv_asm(MOV, coord.w, baseAddr);
|
||||
_viv_asm(MOV, coord.z, baseAddr);
|
||||
|
||||
for(coord.y = 0; coord.y < height; coord.y++)
|
||||
{
|
||||
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_OP4(img_load_3d, src0, input, coord_in, 0, \
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
coord_in.y ++;
|
||||
VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
|
||||
uniConvertInt16Fp32Fst_4x4);
|
||||
VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
|
||||
|
|
@ -213,7 +215,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t
|
|||
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
|
||||
uniConvertHalfToFp16_2x8);
|
||||
_viv_asm(COPY, outval, dst, 16);
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, outval, \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord, outval, \
|
||||
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
|
||||
}
|
||||
}
|
||||
|
|
@ -238,7 +240,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t
|
|||
float scale_vari, bias_val;
|
||||
vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);
|
||||
|
||||
VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\
|
||||
VXC_ReadImage(src1, scale, coord_para.xy, 0,\
|
||||
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
|
||||
_viv_asm(COPY, scale_h, src1, 16);
|
||||
VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
|
||||
|
|
@ -265,7 +267,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t
|
|||
|
||||
for(; coord.y < endH; coord.y++)
|
||||
{
|
||||
VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
|
||||
VXC_ReadImage(src0, input, coord.xy, 0,\
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
|
||||
|
|
@ -294,7 +296,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t
|
|||
int rsFlg)
|
||||
{
|
||||
int gidz = get_global_id(1);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
int4 coord_para = (int4)(gidz, 0, 0, 0);
|
||||
vxc_short8 src0, src2;
|
||||
vxc_short8 src1;
|
||||
|
|
@ -302,7 +305,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t
|
|||
float scale_vari, bias_val;
|
||||
vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);
|
||||
|
||||
VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\
|
||||
VXC_ReadImage(src1, scale, coord_para.xy, 0,\
|
||||
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
|
||||
_viv_asm(COPY, scale_h, src1, 16);
|
||||
VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);
|
||||
|
|
@ -326,15 +329,16 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t
|
|||
int8 input_desc, output_desc;
|
||||
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
|
||||
int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;
|
||||
_viv_asm(MOV, coord.z, baseAddr_a);
|
||||
_viv_asm(MOV, coord_in.z, baseAddr_a);
|
||||
|
||||
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
|
||||
int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;
|
||||
_viv_asm(MOV, coord.w, baseAddr);
|
||||
_viv_asm(MOV, coord.z, baseAddr);
|
||||
for(coord.y = 0; coord.y < height; coord.y++)
|
||||
{
|
||||
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_OP4(img_load_3d, src0, input, coord_in, 0, \
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
coord_in.y ++;
|
||||
VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
|
||||
uniConvertInt16Fp32Fst_4x4);
|
||||
VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
|
||||
|
|
@ -346,7 +350,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t
|
|||
tmpVal1 = convert_int4_rte(norm);
|
||||
VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
|
||||
uniConvertInt32toInt16_2x8);
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord, src2, \
|
||||
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
|
||||
}
|
||||
}
|
||||
|
|
@ -371,7 +375,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t
|
|||
float scale_vari, bias_val;
|
||||
vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);
|
||||
|
||||
VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\
|
||||
VXC_ReadImage(src1, scale, coord_para.xy, 0,\
|
||||
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
|
||||
_viv_asm(COPY, scale_h, src1, 16);
|
||||
VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);
|
||||
|
|
@ -394,7 +398,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t
|
|||
|
||||
for(; coord.y < endH; coord.y++)
|
||||
{
|
||||
VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\
|
||||
VXC_ReadImage(src0, input, coord, 0,\
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
|
||||
uniConvertInt16Fp32Fst_4x4);
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean
|
|||
int gidx = get_global_id(0) << 4;
|
||||
int lidx = get_local_id(0);
|
||||
int gidz = get_global_id(1);
|
||||
int4 coord = (int4)(gidx, 0, gidz, 0);
|
||||
int4 coord = (int4)(gidx, 0, gidz, gidz);
|
||||
vxc_char16 src0;
|
||||
float sum = 0, sqr = 0;
|
||||
int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;
|
||||
|
|
@ -139,7 +139,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to
|
|||
image2d_array_t output, float eps, int rsFlg)
|
||||
{
|
||||
int gidz = get_global_id(1);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
int4 coord_para = (int4)(gidz, 0, 0, 0);
|
||||
vxc_char16 src0;
|
||||
vxc_short8 src1, outval;
|
||||
|
|
@ -277,7 +277,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to
|
|||
image2d_array_t output, float eps, int rsFlg)
|
||||
{
|
||||
int gidz = get_global_id(1);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
int4 coord_para = (int4)(gidz, 0, 0, 0);
|
||||
vxc_char16 src0, src2;
|
||||
vxc_short8 src1;
|
||||
|
|
@ -309,16 +310,17 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to
|
|||
int8 input_desc, output_desc;
|
||||
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
|
||||
int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;
|
||||
_viv_asm(MOV, coord.z, baseAddr_a);
|
||||
_viv_asm(MOV, coord_in.z, baseAddr_a);
|
||||
|
||||
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
|
||||
int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;
|
||||
_viv_asm(MOV, coord.w, baseAddr);
|
||||
_viv_asm(MOV, coord.z, baseAddr);
|
||||
|
||||
for(coord.y = 0; coord.y < height; coord.y++)
|
||||
{
|
||||
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
|
||||
coord_in.y ++;
|
||||
VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);
|
||||
VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);
|
||||
VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);
|
||||
|
|
@ -333,7 +335,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to
|
|||
norm = tmpData3 * alpha + bias_val;
|
||||
tmpVal1 = convert_int4_rte(norm);
|
||||
VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -25,7 +25,8 @@ __kernel void instance_norm_##src1_type_name##F32to##src1_type_name( \
|
|||
image2d_array_t output, float eps, int rsFlg) \
|
||||
{ \
|
||||
int gidz = get_global_id(1); \
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, 0); \
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \
|
||||
int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \
|
||||
int2 coord_para = (int2)(gidz, 0); \
|
||||
read_type src0, src2; \
|
||||
float scale_vari, bias_val; \
|
||||
|
|
@ -60,15 +61,16 @@ __kernel void instance_norm_##src1_type_name##F32to##src1_type_name( \
|
|||
int8 input_desc, output_desc; \
|
||||
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
|
||||
int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \
|
||||
_viv_asm(MOV, coord.z, baseAddr_a); \
|
||||
_viv_asm(MOV, coord_in.z, baseAddr_a); \
|
||||
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
|
||||
int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; \
|
||||
_viv_asm(MOV, coord.w, baseAddr); \
|
||||
_viv_asm(MOV, coord.z, baseAddr); \
|
||||
\
|
||||
for(coord.y = 0; coord.y < height; coord.y++) \
|
||||
{ \
|
||||
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.y ++; \
|
||||
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
|
||||
uniConvert1stUint8SubZpToFp32_4x4); \
|
||||
VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
|
||||
|
|
@ -87,7 +89,7 @@ __kernel void instance_norm_##src1_type_name##F32to##src1_type_name( \
|
|||
norm = tmpData3 * alpha + bias_val; \
|
||||
tmpVal1 = convert_int4_rte(norm); \
|
||||
VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \
|
||||
} \
|
||||
}
|
||||
INSTANCENORM_8BITS_F32(U8, vxc_uchar16)
|
||||
|
|
@ -166,7 +168,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16F
|
|||
image2d_array_t output, float eps, int rsFlg)
|
||||
{
|
||||
int gidz = get_global_id(1);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
int2 coord_para = (int2)(gidz, 0);
|
||||
vxc_short8 src0, src2;
|
||||
float scale_vari, bias_val;
|
||||
|
|
@ -201,15 +204,16 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16F
|
|||
int8 input_desc, output_desc;
|
||||
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
|
||||
int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;
|
||||
_viv_asm(MOV, coord.z, baseAddr_a);
|
||||
_viv_asm(MOV, coord_in.z, baseAddr_a);
|
||||
|
||||
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
|
||||
int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;
|
||||
_viv_asm(MOV, coord.w, baseAddr);
|
||||
_viv_asm(MOV, coord.z, baseAddr);
|
||||
for(coord.y = 0; coord.y < height; coord.y++)
|
||||
{
|
||||
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
coord_in.y ++;
|
||||
VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
|
||||
uniConvertInt16Fp32Fst_4x4);
|
||||
VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
|
||||
|
|
@ -221,7 +225,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16F
|
|||
tmpVal1 = convert_int4_rte(norm);
|
||||
VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
|
||||
uniConvertInt32toInt16_2x8);
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord, src2, \
|
||||
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean
|
|||
int gidx = get_global_id(0) << 3;
|
||||
int lidx = get_local_id(0);
|
||||
int gidz = get_global_id(1);
|
||||
int4 coord = (int4)(gidx, 0, gidz, 0);
|
||||
int4 coord = (int4)(gidx, 0, gidz, gidz);
|
||||
vxc_short8 src0, src1, src2;
|
||||
float4 srcA, srcB;
|
||||
vxc_float sum = 0, sqr = 0;
|
||||
|
|
@ -134,7 +134,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16
|
|||
image2d_array_t output, float eps, int rsFlg)
|
||||
{
|
||||
int gidz = get_global_id(1);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
vxc_short8 src0, src1, src2;
|
||||
float scale_vari, bias_val;
|
||||
vxc_float4 mean_vari = (vxc_float4)(0);
|
||||
|
|
@ -144,7 +145,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16
|
|||
Image img3 = create_image_from_image2d(meanVari, 4);
|
||||
__global float* bias_ptr = (__global float*)img1.ptr;
|
||||
__global float* scal_ptr = (__global float*)img2.ptr;
|
||||
__global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord.wz);
|
||||
__global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, (int2)(0, gidz));
|
||||
__global float4* vari_ptr = (__global float4*)sumVari_ptr;
|
||||
|
||||
float bval = bias_ptr[gidz];
|
||||
|
|
@ -166,16 +167,17 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16
|
|||
int8 input_desc, output_desc;
|
||||
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
|
||||
int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;
|
||||
_viv_asm(MOV, coord.z, baseAddr_a);
|
||||
_viv_asm(MOV, coord_in.z, baseAddr_a);
|
||||
|
||||
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
|
||||
int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;
|
||||
_viv_asm(MOV, coord.w, baseAddr);
|
||||
_viv_asm(MOV, coord.z, baseAddr);
|
||||
|
||||
for(coord.y = 0; coord.y < height; coord.y++)
|
||||
{
|
||||
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
coord_in.y ++;
|
||||
VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
|
||||
uniConvBF16toF32_Part0_2x8);
|
||||
VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
|
||||
|
|
@ -189,7 +191,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16
|
|||
norm = scale_vari * tmpData1 + bias_val;
|
||||
_viv_asm(COPY, src1, norm, 16);
|
||||
VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord, src2, \
|
||||
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -13,7 +13,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16F
|
|||
image2d_array_t output, float eps, int rsFlg)
|
||||
{
|
||||
int gidz = get_global_id(1);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
vxc_short8 src0;
|
||||
vxc_half8 in_h;
|
||||
float scale_vari, bias_val;
|
||||
|
|
@ -24,7 +25,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16F
|
|||
Image img3 = create_image_from_image2d(meanVari, 4);
|
||||
__global float* bias_ptr = (__global float*)img1.ptr;
|
||||
__global float* scal_ptr = (__global float*)img2.ptr;
|
||||
__global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord.wz);
|
||||
__global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, (int2)(0, gidz));
|
||||
__global float4* vari_ptr = (__global float4*)sumVari_ptr;
|
||||
|
||||
float bval = bias_ptr[gidz];
|
||||
|
|
@ -49,18 +50,20 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16F
|
|||
int8 input_desc, output_desc;
|
||||
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
|
||||
int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;
|
||||
_viv_asm(MOV, coord.z, baseAddr_a);
|
||||
_viv_asm(MOV, coord_in.z, baseAddr_a);
|
||||
|
||||
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
|
||||
int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;
|
||||
_viv_asm(MOV, coord.w, baseAddr);
|
||||
_viv_asm(MOV, coord.z, baseAddr);
|
||||
|
||||
for(coord.y = 0; coord.y < height; coord.y++)
|
||||
{
|
||||
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
_viv_asm(COPY, in_h, src0, 16);
|
||||
|
||||
coord_in.y ++;
|
||||
|
||||
VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
|
||||
UniFP16toFP32Lo4_dp4x4);
|
||||
VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
|
||||
|
|
@ -74,7 +77,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16F
|
|||
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
|
||||
uniConvertHalfToFp16_2x8);
|
||||
_viv_asm(COPY, outval, dst, 16);
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, outval, \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord, outval, \
|
||||
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -29,7 +29,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean
|
|||
int gidx = get_global_id(0) << 4;
|
||||
int lidx = get_local_id(0);
|
||||
int gidz = get_global_id(1);
|
||||
int4 coord = (int4)(gidx, 0, gidz, 0);
|
||||
int4 coord = (int4)(gidx, 0, gidz, gidz);
|
||||
vxc_uchar16 src0;
|
||||
float sum = 0, sqr = 0;
|
||||
int tmpSum = 0, tmpSqr = 0, tmpSum1 = 0, tmpSqr1 = 0;
|
||||
|
|
@ -44,7 +44,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean
|
|||
{
|
||||
for(coord.y = 0; coord.y < height;)
|
||||
{
|
||||
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_OP4(img_load_3d, src0, input, coord, 0, \
|
||||
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
|
||||
coord.y++;
|
||||
VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);
|
||||
|
|
@ -96,7 +96,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean
|
|||
{
|
||||
for(; coord.y < endH;)
|
||||
{
|
||||
VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),
|
||||
VXC_ReadImage(src0, input, coord, 0,
|
||||
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
|
||||
coord.y++;
|
||||
VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);
|
||||
|
|
@ -133,7 +133,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to
|
|||
image2d_array_t output, float eps, int rsFlg)
|
||||
{
|
||||
int gidz = get_global_id(1);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
int4 coord_para = (int4)(gidz, 0, 0, 0);
|
||||
vxc_uchar16 src0, src2;
|
||||
vxc_short8 src1;
|
||||
|
|
@ -141,7 +142,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to
|
|||
float scale_vari, bias_val;
|
||||
vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);
|
||||
|
||||
VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\
|
||||
VXC_ReadImage(src1, scale, coord_para.xy, 0,\
|
||||
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
|
||||
_viv_asm(COPY, scale_h, src1, 16);
|
||||
VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);
|
||||
|
|
@ -166,15 +167,16 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to
|
|||
int8 input_desc, output_desc;
|
||||
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
|
||||
int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;
|
||||
_viv_asm(MOV, coord.z, baseAddr_a);
|
||||
_viv_asm(MOV, coord_in.z, baseAddr_a);
|
||||
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
|
||||
int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;
|
||||
_viv_asm(MOV, coord.w, baseAddr);
|
||||
_viv_asm(MOV, coord.z, baseAddr);
|
||||
|
||||
for(coord.y = 0; coord.y < height; coord.y++)
|
||||
{
|
||||
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_OP4(img_load_3d, src0, input, coord_in, 0, \
|
||||
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
|
||||
coord_in.y ++;
|
||||
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);
|
||||
VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);
|
||||
VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);
|
||||
|
|
@ -189,7 +191,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to
|
|||
norm = tmpData3 * alpha + bias_val;
|
||||
tmpVal1 = convert_int4_rte(norm);
|
||||
VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -208,7 +210,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to
|
|||
float scale_vari, bias_val;
|
||||
vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);
|
||||
|
||||
VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\
|
||||
VXC_ReadImage(src1, scale, coord_para.xy, 0,\
|
||||
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
|
||||
_viv_asm(COPY, scale_h, src1, 16);
|
||||
VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);
|
||||
|
|
@ -232,7 +234,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to
|
|||
|
||||
for(; coord.y < endH; coord.y++)
|
||||
{
|
||||
VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);
|
||||
VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);
|
||||
VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to
|
|||
image2d_array_t output, float eps, int rsFlg)
|
||||
{
|
||||
int gidz = get_global_id(1);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
int4 coord_para = (int4)(gidz, 0, 0, 0);
|
||||
vxc_uchar16 src0;
|
||||
vxc_short8 src1, outval;
|
||||
|
|
|
|||
|
|
@@ -6,14 +6,6 @@ do \
VXC_OP3_NoDest(vstore3, Pointer, byteOffset, Data); } \
while(0)

inline uchar* get_image2D_array_ptr(image2d_array_t input)
{
int8 desc;
_viv_asm(COPY, desc, input, sizeof(desc));
uchar *src_ptr = (uchar*)desc.s0;
return src_ptr;
}

#define L2NORMSCALE_SWITCH_PROCESS(case_value, vec_val, ZpValue) \
switch (case_value) \
{ \

@@ -104,8 +96,10 @@ _viv_uniform int inputZP;

#define L2NORMSCALE_MUL_AXIS0_PROCESS(dst_type, convert_type, output_type, copy_type) \
vxc_float4 rsqrt0;\
dst_type *dst_ptr = (dst_type *)get_image2D_array_ptr(output); \
short *scale_ptr = (short *)get_image2D_array_ptr(scale); \
Image dst_img = create_image_from_image2d(output, 1); \
dst_type *dst_ptr = (dst_type *)dst_img.ptr; \
Image s_img = create_image_from_image2d(scale, 2); \
short *scale_ptr = (short *)s_img.ptr; \
vxc_float4 vec0, vec1;\
convert_type dst0, dst1;\
vxc_short8 scale_s16;\

@@ -188,15 +182,16 @@ _viv_uniform int inputZP;
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) \
void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name##_2D \
(\
__read_only image2d_array_t input,\
__read_only image2d_array_t scale,\
__write_only image2d_array_t output,\
__read_only image2d_t input,\
__read_only image2d_t scale,\
__write_only image2d_t output,\
int axis\
)\
{ \
int lidx = get_local_id(0); \
int offset = get_global_id(0); \
read_type *src_ptr_base = (read_type *)get_image2D_array_ptr(input); \
Image src_img = create_image_from_image2d(input, 1); \
read_type *src_ptr_base = (read_type *)src_img.ptr; \
read_type *src_ptr; \
read_type2 src0, src1; \
src_type val0, val1; \

@@ -267,7 +262,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \
{ \
int lidx = get_local_id(0); \
int offset = get_global_id(0); \
uchar *src_ptr_base = (uchar *)get_image2D_array_ptr(input); \
Image src_img = create_image_from_image2d(input, 1);
uchar *src_ptr_base = (uchar *)src_img.ptr; \
uchar *src_ptr; \
vxc_uchar8 src0, src1; \
vxc_uchar8 val0, val1; \
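The hunks above drop the file-local get_image2D_array_ptr() helper in favour of the shared Image / create_image_from_image2d wrapper. As a rough sketch of the access pattern after the change (the Image type itself is defined elsewhere in the library; reading the second argument as bytes-per-element is an inference from the 1/2/4 values used in these hunks, not a confirmed contract):

    /* Sketch only: pointer-style access through the Image wrapper, as the hunks above use it. */
    Image dst_img = create_image_from_image2d(output, 1);   /* 1 byte per element (assumed meaning) */
    dst_type *dst_ptr = (dst_type *)dst_img.ptr;            /* .ptr taken as the image base address */
    Image s_img = create_image_from_image2d(scale, 2);      /* 2 bytes per element (assumed meaning) */
    short *scale_ptr = (short *)s_img.ptr;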
@@ -11,7 +11,7 @@ __kernel void layer_norm_F16toF16(
image2d_array_t input, image2d_t bias, image2d_t scale,
image2d_array_t output, float eps)
{
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_out = coord;

int8 input_desc, output_desc;

@@ -21,18 +21,18 @@ __kernel void layer_norm_F16toF16(

vxc_short8 src0, src1;
vxc_float sum = 0, sqr = 0;
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
_viv_asm(MOV, coord_out.z, baseAddr);

for(coord.x = 8; coord.x < (width+8); coord.x += 8)
{
vxc_half8 val0_h;
_viv_asm(COPY, val0_h, src0, 16);
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_float4 sumsqr;
VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\

@@ -49,7 +49,7 @@ __kernel void layer_norm_F16toF16(
vxc_float4 bias_f;
for(coord.x = 0; coord.x < width; coord.x += 4)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));

@@ -73,7 +73,7 @@ __kernel void layer_norm_F16toF16(
vxc_short8 dstval;
_viv_asm(COPY, dstval, dst, 16);
coord_out.x = coord.x;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dstval, \
VXC_OP4_NoDest(img_store_3d, output, coord_out, dstval, \
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
}
@@ -99,7 +99,7 @@ __kernel void layer_norm_U8toU8(
image2d_array_t input, image2d_t bias, image2d_t scale,
image2d_array_t output, float eps)
{
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_out = coord;

vxc_uchar16 src0, src2;

@@ -119,11 +119,11 @@ __kernel void layer_norm_U8toU8(

_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
_viv_asm(MOV, coord_out.z, baseAddr);

for(coord.x = 0; coord.x < width; coord.x += 16)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);
tmpSum += (tmpSum1.x);

@@ -144,7 +144,7 @@ __kernel void layer_norm_U8toU8(

for(coord.x = 0; coord.x < width; coord.x += 16)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

@@ -203,7 +203,7 @@ __kernel void layer_norm_U8toU8(
VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
coord_out.x = coord.x;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src2, \
VXC_OP4_NoDest(img_store_3d, output, coord_out, src2, \
VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
}
}
@@ -212,7 +212,7 @@ __kernel void layer_norm_F16toU8(
image2d_array_t input, image2d_t bias, image2d_t scale,
image2d_array_t output, float eps)
{
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_out = coord;

int8 input_desc, output_desc;

@@ -222,18 +222,18 @@ __kernel void layer_norm_F16toU8(

vxc_short8 src0, src1;
vxc_float sum = 0, sqr = 0;
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
_viv_asm(MOV, coord_out.z, baseAddr);

for(coord.x = 8; coord.x < (width+8); coord.x += 8)
{
vxc_half8 val0_h;
_viv_asm(COPY, val0_h, src0, 16);
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_float4 sumsqr;
VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\

@@ -250,7 +250,7 @@ __kernel void layer_norm_F16toU8(
vxc_float4 bias_f;
for(coord.x = 0; coord.x < width; coord.x += 4)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));

@@ -273,7 +273,7 @@ __kernel void layer_norm_F16toU8(
VXC_DP2x8(dst, output_int4, output_int4, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),
uniConvertInt32toUint8_2x8);
coord_out.x = coord.x;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, \
VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, \
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
}
@@ -21,24 +21,25 @@ __kernel void layer_norm_I16toI16(
image2d_array_t input, image2d_t bias, image2d_t scale,
image2d_array_t output, float eps)
{
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_in = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));

int8 input_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.z, baseAddr_a);
_viv_asm(MOV, coord_in.z, baseAddr_a);

_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord.w, baseAddr);
_viv_asm(MOV, coord.z, baseAddr);

vxc_short8 src0, src1, dst;
vxc_float sum = 0, sqr = 0;
for(; coord.x < width;)
for(; coord_in.x < width;)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.x += 8;
coord_in.x += 8;
vxc_float4 sumsqr;
VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\
uniInt16SumSqr_dp8x2);

@@ -60,11 +61,11 @@ __kernel void layer_norm_I16toI16(

int2 coord_bias = (int2)(0, 0);

for(coord.x = 0; coord.x < width; coord.x += 8)
for(coord_in.x = 0; coord_in.x < width; coord_in.x += 8, coord.x += 8)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_bias.x = coord.x;
coord_bias.x = coord_in.x;
VXC_ReadImage(src1, scale, coord_bias, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);

@@ -92,7 +93,7 @@ __kernel void layer_norm_I16toI16(

VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst, \
VXC_OP4_NoDest(img_store_3d, output, coord, dst, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
}
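A recurring pattern in the layer_norm hunks above and below: the single coord vector is split into coord (output side) and coord_in (input side), and each carries its own tensor's slice base address in .z, computed from that image's descriptor. A minimal sketch of the setup, assuming, as these kernels do, that descriptor word s0 holds the base address and s4 the per-slice stride:

    /* Sketch only: per-tensor base addresses kept in separate coordinate vectors. */
    int8 input_desc, output_desc;
    _viv_asm(COPY, input_desc, input, sizeof(input_desc));
    _viv_asm(COPY, output_desc, output, sizeof(output_desc));
    int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;    /* input slice  */
    int baseAddr   = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;  /* output slice */
    _viv_asm(MOV, coord_in.z, baseAddr_a);   /* reads go through coord_in */
    _viv_asm(MOV, coord.z, baseAddr);        /* writes go through coord   */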
@@ -11,7 +11,7 @@ __kernel void layer_norm_F16F32toF16(
image2d_array_t input, image2d_t bias, image2d_t scale,
image2d_array_t output, float eps)
{
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_out = coord;

int8 input_desc, output_desc;

@@ -21,20 +21,20 @@ __kernel void layer_norm_F16F32toF16(

vxc_short8 src0;
vxc_float sum = 0, sqr = 0;
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
Image img1 = create_image_from_image2d(bias, 4);
Image img2 = create_image_from_image2d(scale, 4);

_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
_viv_asm(MOV, coord_out.z, baseAddr);

for(coord.x = 8; coord.x < (width+8); coord.x += 8)
{
vxc_half8 val0_h;
_viv_asm(COPY, val0_h, src0, 16);
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_float4 sumsqr;
VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\

@@ -49,11 +49,11 @@ __kernel void layer_norm_F16F32toF16(
vari += eps;
vari = rsqrt(vari);
vxc_float4 bias_f, scale_f, in_f;
__global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.ww);
__global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.ww);
__global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, (int2)(0, 0));
__global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, (int2)(0, 0));
for(coord.x = 0; coord.x < width; coord.x += 4)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
bias_f = vload4(0, bias_ptr + coord.x);
scale_f = vload4(0, scale_ptr + coord.x);

@@ -72,7 +72,7 @@ __kernel void layer_norm_F16F32toF16(
vxc_short8 dstval;
_viv_asm(COPY, dstval, dst, 16);
coord_out.x = coord.x;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dstval, \
VXC_OP4_NoDest(img_store_3d, output, coord_out, dstval, \
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
}
@@ -100,7 +100,7 @@ __kernel void layer_norm_U8F32toU8(
image2d_array_t input, image2d_t bias, image2d_t scale,
image2d_array_t output, float eps)
{
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_out = coord;

vxc_uchar16 src0, src2;

@@ -118,11 +118,11 @@ __kernel void layer_norm_U8F32toU8(

_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
_viv_asm(MOV, coord_out.z, baseAddr);

for(coord.x = 0; coord.x < width; coord.x += 16)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);
tmpSum += (tmpSum1.x);

@@ -142,11 +142,11 @@ __kernel void layer_norm_U8F32toU8(

Image img1 = create_image_from_image2d(bias, 4);
Image img2 = create_image_from_image2d(scale, 4);
__global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.ww);
__global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.ww);
__global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, (int2)(0, 0));
__global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, (int2)(0, 0));
for(coord.x = 0; coord.x < width; coord.x += 16)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
bias_f0 = vload4(0, bias_ptr);
bias_f1 = vload4(1, bias_ptr);

@@ -193,7 +193,7 @@ __kernel void layer_norm_U8F32toU8(
VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
coord_out.x = coord.x;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src2, \
VXC_OP4_NoDest(img_store_3d, output, coord_out, src2, \
VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
}
}
@@ -202,24 +202,25 @@ __kernel void layer_norm_I16F32toI16(
image2d_array_t input, image2d_t bias, image2d_t scale,
image2d_array_t output, float eps)
{
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_in = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));

int8 input_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.z, baseAddr_a);
_viv_asm(MOV, coord_in.z, baseAddr_a);

_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord.w, baseAddr);
_viv_asm(MOV, coord.z, baseAddr);

vxc_short8 src0, dst;
vxc_float sum = 0, sqr = 0;
for(; coord.x < width;)
for(; coord_in.x < width;)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.x += 8;
coord_in.x += 8;
vxc_float4 sumsqr;
VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\
uniInt16SumSqr_dp8x2);

@@ -243,9 +244,9 @@ __kernel void layer_norm_I16F32toI16(
Image img2 = create_image_from_image2d(scale, 4);
__global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord_bias);
__global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord_bias);
for(coord.x = 0; coord.x < width; coord.x += 8)
for(coord_in.x = 0; coord_in.x < width; coord_in.x += 8, coord.x += 8)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = vload4(0, bias_ptr);
bias_f1 = vload4(1, bias_ptr);

@@ -269,7 +270,7 @@ __kernel void layer_norm_I16F32toI16(

VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst, \
VXC_OP4_NoDest(img_store_3d, output, coord, dst, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
}
@@ -11,7 +11,7 @@ __kernel void layer_norm_BF16F32toBF16(
image2d_array_t input, image2d_t bias, image2d_t scale,
image2d_array_t output, float eps)
{
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_out = coord;

int8 input_desc, output_desc;

@@ -30,7 +30,7 @@ __kernel void layer_norm_BF16F32toBF16(

_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
_viv_asm(MOV, coord_out.z, baseAddr);
float4 srcA, srcB;
for(coord.x = 8; coord.x < (width+8); coord.x += 8)
{

@@ -40,7 +40,7 @@ __kernel void layer_norm_BF16F32toBF16(
uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, srcA, src1, 16);
_viv_asm(COPY, srcB, src2, 16);
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
sum += dot(srcA, ones) + dot(srcB, ones);
sqr += dot(srcA * srcA, ones) + dot(srcB * srcB, ones);

@@ -52,12 +52,12 @@ __kernel void layer_norm_BF16F32toBF16(
vari += eps;
vari = rsqrt(vari);
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
__global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.ww);
__global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.ww);
__global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, (int2)(0, 0));
__global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, (int2)(0, 0));

for(coord.x = 0; coord.x < width; coord.x += 8)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = vload4(0, bias_ptr);
bias_f1 = vload4(1, bias_ptr);

@@ -85,7 +85,7 @@ __kernel void layer_norm_BF16F32toBF16(
VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);

coord_out.x = coord.x;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src2, \
VXC_OP4_NoDest(img_store_3d, output, coord_out, src2, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
}
@@ -26,7 +26,7 @@ __kernel void layer_norm_U8toF16(
image2d_array_t output,
float eps)
{
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_out = coord;
vxc_uchar16 src0;
float sum = 0, sqr = 0;

@@ -41,11 +41,11 @@ __kernel void layer_norm_U8toF16(

_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
_viv_asm(MOV, coord_out.z, baseAddr);

for(coord.x = 0; coord.x < width; coord.x += 16)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);
tmpSum += (tmpSum1.x);

@@ -71,7 +71,7 @@ __kernel void layer_norm_U8toF16(

for(coord.x = 0; coord.x < width; coord.x += 16)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

@@ -121,7 +121,7 @@ __kernel void layer_norm_U8toF16(
UniPackFP16even_2x8);
_viv_asm(COPY, outval, dst, 16);
coord_out.x = coord.x;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, outval, \
VXC_OP4_NoDest(img_store_3d, output, coord_out, outval, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));

tmpData2 -= mean;

@@ -135,7 +135,7 @@ __kernel void layer_norm_U8toF16(
UniPackFP16even_2x8);
_viv_asm(COPY, outval, dst, 16);
coord_out.x += 8;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, outval, \
VXC_OP4_NoDest(img_store_3d, output, coord_out, outval, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
}
@@ -39,7 +39,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSq
{
for(coord.y = 0; coord.y < height;)
{
VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.y++;
_viv_asm(COPY, in_h, src0, 16);

@@ -134,7 +134,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16to
image2d_array_t output, float eps)
{
int gidz = get_global_id(1);
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);
int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);
int2 coord_sum = (int2)(0, gidz);
int4 coord_para = coord;
coord_para.z = (ushort)gidz / (ushort)(height_depth);

@@ -157,8 +158,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16to

int8 input_desc, scale_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.w, baseAddr_a);
int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.z, baseAddr_a);

_viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));
int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;

@@ -175,11 +176,12 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16to

for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.y ++;
coord_para.y = coord.y;
coord_bias.y = coord.y;
VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;

@@ -208,7 +210,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16to
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
uniConvertHalfToFp16_2x8);
_viv_asm(COPY, outval, dst, 16);
VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \
VXC_OP4_NoDest(img_store_3d, output, coord, outval, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
}
@@ -284,7 +286,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16to
image2d_array_t output, float eps)
{
int gidz = get_global_id(1);
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);
int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);
int2 coord_sum = (int2)(0, gidz);
int4 coord_para = coord;
coord_para.z = (ushort)gidz / (ushort)(height_depth);

@@ -307,8 +310,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16to

int8 input_desc, scale_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.w, baseAddr_a);
int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.z, baseAddr_a);

_viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));
int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;

@@ -324,11 +327,12 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16to

for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.y ++;
coord_para.y = coord.y;
coord_bias.y = coord.y;
VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;

@@ -356,7 +360,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16to
tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);
VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \
VXC_OP4_NoDest(img_store_3d, output, coord, outval, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
}
@@ -43,7 +43,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSq
{
for(coord.y = 0; coord.y < height;)
{
VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.y++;
vxc_float4 sumsqr;

@@ -130,7 +130,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16to
image2d_array_t output, float eps)
{
int gidz = get_global_id(1);
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);
int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);
int2 coord_sum = (int2)(0, gidz);
int4 coord_para = coord;
coord_para.z = (ushort)gidz / (ushort)(height_depth);

@@ -152,8 +153,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16to

int8 input_desc, scale_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.w, baseAddr_a);
int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.z, baseAddr_a);

_viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));
int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;

@@ -169,11 +170,12 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16to

for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.y ++;
coord_para.y = coord.y;
coord_bias.y = coord.y;
VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;

@@ -199,7 +201,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16to

VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \
VXC_OP4_NoDest(img_store_3d, output, coord, outval, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
}
@@ -48,7 +48,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSq
{
for(coord.y = 0; coord.y < height;)
{
VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_OP4(img_load_3d, src0, input, coord.xywz, 0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord.y++;
VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);

@@ -101,7 +101,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSq
{
for(; coord.y < endH;)
{
VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),
VXC_ReadImage(src0, input, coord, 0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord.y++;
VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);

@@ -138,7 +138,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF
image2d_array_t output, float eps)
{
int gidz = get_global_id(1);
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);
int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);
int2 coord_sum = (int2)(0, gidz);
int4 coord_para = coord;
coord_para.z = (ushort)gidz / (ushort)(height_depth);

@@ -161,8 +162,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF

int8 input_desc, scale_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.w, baseAddr_a);
int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.z, baseAddr_a);

_viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));
int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;

@@ -178,10 +179,11 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF

for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_OP4(img_load_3d, src0, input, coord_in, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.y ++;
coord_para.y = coord.y; coord_bias.y = coord.y;
VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;

@@ -208,7 +210,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
uniConvertHalfToFp16_2x8);
_viv_asm(COPY, outval, dst, 16);
VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \
VXC_OP4_NoDest(img_store_3d, output, coord, outval, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
}
@@ -242,10 +244,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF

for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_ReadImage(src0, input, coord, 0,\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_bias.y = coord.y;
VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_ReadImage(src1, scale, coord, 0,\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;

@@ -281,7 +283,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU
image2d_array_t output, float eps)
{
int gidz = get_global_id(1);
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);
int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);
int2 coord_sum = (int2)(0, gidz);
int4 coord_para = coord;
coord_para.z = (ushort)gidz / (ushort)(height_depth);

@@ -304,8 +307,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU

int8 input_desc, scale_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.w, baseAddr_a);
int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.z, baseAddr_a);

_viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));
int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;

@@ -321,11 +324,12 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU

for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_OP4(img_load_3d, src0, input, coord_in, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.y ++;
coord_para.y = coord.y;
coord_bias.y = coord.y;
VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;

@@ -351,7 +355,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU

VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \
VXC_OP4_NoDest(img_store_3d, output, coord, outval, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
}

@@ -385,10 +389,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU

for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_ReadImage(src0, input, coord, 0,\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_bias.y = coord.y;
VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_ReadImage(src1, scale, coord, 0,\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
Some files were not shown because too many files have changed in this diff.