Update internal to REL/v1.1.30.2

SHA: 2e64046f

Signed-off-by: Kainan Cha <kainan.zha@verisilicon.com>
Kainan Cha 2021-03-29 16:21:46 +08:00
parent b5f2666e92
commit c141416238
120 changed files with 14252 additions and 11997 deletions


@ -194,22 +194,13 @@ cc_library(
"src/kernel/vsi_nn_kernel_param.c",
"src/kernel/vsi_nn_gpu.c",
"src/kernel/vsi_nn_kernel_gpu_shape_optimize.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_crop.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_fullconnect2.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_shufflechannel.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_resize.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_scale.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_space2depth.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_imageprocess.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_layernormalize.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_reduce.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_signalframe.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_tensorstackconcat.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_sync_host.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_spatial_transformer.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_extra_ending.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_topk.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_roi_align.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_heatmap_max_keypoint.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_axis_aligned_bbox_transform.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_box_with_nms_limit.c",


@ -146,3 +146,4 @@ DEF_OP(SCATTER_ND)
DEF_OP(DECONVOLUTION1D)
DEF_OP(INTERP)
DEF_OP(RESIZE_1D)
DEF_OP(UPSAMPLESCALE)


@ -16,3 +16,4 @@ DEF_OP(GRUCELL_ACTIVATION_INTERNAL)
DEF_OP(GRUCELL_ACTIVATION_INTERNAL_SMA)
DEF_OP(RESIZE_1D_BILINEAR_INTERNAL)
DEF_OP(RESIZE_1D_NEAREST_INTERNAL)
DEF_OP(SPACE2DEPTH_INTERNAL)
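
The two op tables above are plain DEF_OP lists: UPSAMPLESCALE and SPACE2DEPTH_INTERNAL become visible to the rest of the stack simply by appending one line each. A minimal, self-contained sketch of the X-macro pattern such lists are typically expanded with; the enum and name table below are illustrative, not the actual ovxlib expansion:

#include <stdio.h>

/* Illustrative op list in the same style as the DEF_OP tables above. */
#define OP_LIST \
    DEF_OP(RESIZE_1D) \
    DEF_OP(UPSAMPLESCALE) \
    DEF_OP(SPACE2DEPTH_INTERNAL)

/* Expand once into an enum ... */
#define DEF_OP(NAME) OP_##NAME,
enum { OP_LIST OP_COUNT };
#undef DEF_OP

/* ... and once into a printable name table. */
#define DEF_OP(NAME) #NAME,
static const char* op_names[] = { OP_LIST };
#undef DEF_OP

int main(void)
{
    int i;
    for (i = 0; i < OP_COUNT; i++)
    {
        printf("%d: %s\n", i, op_names[i]);
    }
    return 0;
}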


@ -38,6 +38,14 @@ vsi_bool vsi_nn_kernel_optimize_reduce_shape
int32_t* out_axis, uint32_t* out_axis_size
);
vsi_bool vsi_nn_kernel_optimize_tensor_shape
(
const int32_t* shape_x, const size_t rank_x,
const int32_t *axis, const size_t axis_size,
int32_t* out_shape_x, uint32_t* out_rank_x,
int32_t* out_axis, uint32_t* out_axis_size
);
vsi_bool vsi_nn_kernel_optimize_element_shape
(
const int32_t* shape_x, const size_t rank_x,
@ -59,4 +67,16 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape
int32_t* out_shape_output, uint32_t* out_rank_output
);
vsi_bool vsi_nn_kernel_optimize_1d_tensor_shape
(
const int32_t* shape, const uint32_t rank,
int32_t* out_shape, uint32_t* out_rank
);
vsi_bool vsi_nn_kernel_optimize_nchw2xhw_shape
(
const int32_t* shape, const uint32_t rank,
int32_t* out_shape, uint32_t* out_rank
);
#endif
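
The two new declarations suggest small shape-canonicalization helpers: padding a low-rank tensor up to a GPU-friendly rank, and folding an NCHW shape into a flattened spatial layout. A standalone sketch of what the 1-D padding variant could look like, purely as an assumption about its behavior, not the ovxlib implementation:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* Illustrative only: pad a low-rank shape with trailing 1s so it is at least
 * 2-D; NOT the actual vsi_nn_kernel_optimize_1d_tensor_shape implementation. */
static bool pad_shape_to_2d(const int32_t* shape, uint32_t rank,
                            int32_t* out_shape, uint32_t* out_rank)
{
    uint32_t i;
    if (!shape || !out_shape || !out_rank || rank == 0)
    {
        return false;
    }
    for (i = 0; i < rank; i++)
    {
        out_shape[i] = shape[i];
    }
    for (i = rank; i < 2; i++)
    {
        out_shape[i] = 1;
    }
    *out_rank = rank < 2 ? 2 : rank;
    return true;
}

int main(void)
{
    int32_t in[1] = { 100 };
    int32_t out[2] = { 0 };
    uint32_t out_rank = 0;
    pad_shape_to_2d(in, 1, out, &out_rank);
    printf("rank=%u shape=[%d, %d]\n", out_rank, out[0], out[1]);  /* rank=2 [100, 1] */
    return 0;
}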


@ -372,10 +372,6 @@ enum vx_kernel_libnnext_offset_e
#define VX_KERNEL_NAME_GRAYSCALETOTENSOR_INT16_COPY VIVANTE_NAMESPACE ".GrayScaletoTensor_Int16_copy"
#define VX_KERNEL_NAME_GRAYSCALETOTENSOR_UINT8 VIVANTE_NAMESPACE ".GrayScaletoTensor_UInt8"
#define VX_KERNEL_NAME_GRAYSCALETOTENSOR_UINT8_COPY VIVANTE_NAMESPACE ".GrayScaletoTensor_UInt8_copy"
#define VX_KERNEL_NAME_LAYERNORM VIVANTE_NAMESPACE ".vxcLayerNorm"
#define VX_KERNEL_NAME_LAYERNORM_UINT8 VIVANTE_NAMESPACE ".vxcLayerNorm_u8"
#define VX_KERNEL_NAME_LAYERNORM_FP16TOU8 VIVANTE_NAMESPACE ".vxcLayerNormFP16toU8"
#define VX_KERNEL_NAME_LAYERNORM_U8TOFP16 VIVANTE_NAMESPACE ".vxcLayerNormU8toFp16"
#define VX_KERNEL_NAME_TENSORSTACKCONCAT VIVANTE_NAMESPACE ".vxcTensorStackConcat"
#define VX_KERNEL_NAME_TENSORSTACKCONCAT8BITS VIVANTE_NAMESPACE ".vxcTensorStackConcat8Bits"
#define VX_KERNEL_NAME_SIGNALFRAME_WIDTH VIVANTE_NAMESPACE ".vxcSignalFrame_width"


@ -70,6 +70,10 @@ typedef struct _vsi_nn_instancenorm_lcl_data2
uint32_t reshapeFlg;
uint32_t hash_idx;
vsi_bool execute_on_sw;
/* handle 3D instance norm */
vsi_nn_tensor_t *reshaped_input;
vsi_nn_tensor_t *reshaped_output;
} vsi_nn_instancenorm_lcl_data2;
typedef struct _vsi_nn_instancenorm_lcl_data


@ -0,0 +1,44 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_SPACE2DEPTH_INTERNAL_H
#define _VSI_NN_OP_SPACE2DEPTH_INTERNAL_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_space2depth_internal_param
{
int32_t block_size_x;
int32_t block_size_y;
} vsi_nn_space2depth_internal_param;
#ifdef __cplusplus
}
#endif
#endif
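
block_size_x and block_size_y let the internal space-to-depth op fold the two spatial dimensions into channels independently. A reference sketch of one common index mapping, given only to illustrate the parameters; the exact channel ordering used by the CL kernel may differ:

#include <stdio.h>
#include <stdint.h>

/* One common space-to-depth mapping (illustrative): element (x, y, c) of a
 * W x H x C input moves to
 * (x / bx, y / by, c + C * ((x % bx) + bx * (y % by))). */
static void space2depth_map_index(int32_t x, int32_t y, int32_t c,
                                  int32_t channels, int32_t bx, int32_t by,
                                  int32_t* ox, int32_t* oy, int32_t* oc)
{
    *ox = x / bx;
    *oy = y / by;
    *oc = c + channels * ((x % bx) + bx * (y % by));
}

int main(void)
{
    int32_t ox, oy, oc;
    space2depth_map_index(3, 1, 0, 4, 2, 1, &ox, &oy, &oc);  /* bx=2, by=1 */
    printf("(%d, %d, %d)\n", ox, oy, oc);                    /* (1, 1, 4)   */
    return 0;
}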


@ -0,0 +1,39 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_UPSAMPLESCALE_H
#define _VSI_NN_OP_UPSAMPLESCALE_H
#include "vsi_nn_types.h"
typedef struct _vsi_nn_upsamplescale_param
{
struct _upsamplescale_local_data_t* local;
// Add parameters here
int32_t stride;
float scale;
} vsi_nn_upsamplescale_param;
#endif
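
The parameter struct implies nearest-neighbour upsampling by stride followed by multiplying every element by scale. A hedged CPU reference sketch for a single 2-D plane; illustrative only, the actual kernel semantics may differ:

#include <stdio.h>
#include <stdint.h>

/* Replicate each input pixel into a stride x stride block, then multiply by
 * scale; out must hold (w*stride) x (h*stride) floats. */
static void upsamplescale_plane(const float* in, int32_t w, int32_t h,
                                int32_t stride, float scale, float* out)
{
    int32_t x, y;
    int32_t ow = w * stride;
    int32_t oh = h * stride;
    for (y = 0; y < oh; y++)
    {
        for (x = 0; x < ow; x++)
        {
            out[y * ow + x] = in[(y / stride) * w + (x / stride)] * scale;
        }
    }
}

int main(void)
{
    const float in[1] = { 2.0f };
    float out[4];
    upsamplescale_plane(in, 1, 1, 2, 0.5f, out);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* 1 1 1 1 */
    return 0;
}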


@ -677,6 +677,11 @@ OVXLIB_API vsi_status vsi_nn_TrySetupCompleteSignalNode
vsi_nn_graph_t* graph
);
vsi_status vsi_nn_setup_binary_graph_inputs_outputs
(
vsi_nn_graph_t* graph
);
void vsi_nn_get_tensor_consumers
(
vsi_nn_graph_t* graph,


@ -56,6 +56,7 @@
#include "ops/vsi_nn_op_elu.h"
#include "ops/vsi_nn_op_reverse.h"
#include "ops/vsi_nn_op_space2depth.h"
#include "ops/vsi_nn_op_space2depth_internal.h"
#include "ops/vsi_nn_op_depth2space.h"
#include "ops/vsi_nn_op_depth2space_internal.h"
#include "ops/vsi_nn_op_maximum.h"
@ -162,6 +163,7 @@
#include "ops/vsi_nn_op_resize_1d.h"
#include "ops/vsi_nn_op_resize_1d_bilinear_internal.h"
#include "ops/vsi_nn_op_resize_1d_nearest_internal.h"
#include "ops/vsi_nn_op_upsamplescale.h"
/* custom node head define */
#include "custom/vsi_nn_custom_node_type.h"
@ -204,6 +206,7 @@ typedef union _vsi_nn_nn_param
vsi_nn_elu_param elu;
vsi_nn_reverse_param reverse;
vsi_nn_space2depth_param space2depth;
vsi_nn_space2depth_internal_param space2depth_internal;
vsi_nn_depth2space_param depth2space;
vsi_nn_depth2space_internal_param depth2space_internal;
vsi_nn_maximum_param maximum;
@ -310,6 +313,7 @@ typedef union _vsi_nn_nn_param
vsi_nn_resize_1d_param resize_1d;
vsi_nn_resize_1d_bilinear_internal_param resize_1d_bilinear_internal;
vsi_nn_resize_1d_nearest_internal_param resize_1d_nearest_internal;
vsi_nn_upsamplescale_param upsamplescale;
uint8_t client_param[128];
/* custom node data struct define */
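
The union gains one member per new op, so parameters are reached as node->nn_param.space2depth_internal or node->nn_param.upsamplescale. A self-contained sketch of the underlying pattern, one overlaid parameter struct per op selected by the node's op type; the names below are illustrative, not the real vsi_nn_nn_param layout:

#include <stdio.h>
#include <stdint.h>

typedef struct { int32_t block_size_x; int32_t block_size_y; } space2depth_internal_param;
typedef struct { int32_t stride; float scale; } upsamplescale_param;

typedef union
{
    space2depth_internal_param space2depth_internal;
    upsamplescale_param        upsamplescale;
    uint8_t                    client_param[128]; /* keeps the union size stable */
} nn_param_sketch;

int main(void)
{
    nn_param_sketch p;
    p.upsamplescale.stride = 2;
    p.upsamplescale.scale  = 0.5f;
    printf("stride=%d scale=%g\n", (int)p.upsamplescale.stride, p.upsamplescale.scale);
    return 0;
}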


@ -65,6 +65,12 @@ typedef enum
VSI_NN_SOURCE_LAYOUT_NCHW,
} vsi_nn_preprocess_source_layout_e;
typedef enum
{
VSI_NN_DEST_LAYOUT_NHWC = 0,
VSI_NN_DEST_LAYOUT_NCHW,
} vsi_nn_preprocess_dest_layout_e;
/**
* Input source format
*/


@ -214,7 +214,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer)
width = input_shape->data[0];
height = input_shape->data[1];
chn = attr[1]->shape->data[1];
if(rsFlg)
if (rsFlg)
{
height = height / chn;
}
@ -281,7 +281,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
width = input_shape->data[0];
height = input_shape->data[1];
chn = attr[1]->shape->data[1];
if(rsFlg)
if (rsFlg)
{
height = height / chn;
}
@ -355,12 +355,12 @@ static vsi_status _query_kernel
for( i = 0; i < kernel_map_size; i ++ )
{
if( kernel_map[i].key == hashkey )
if ( kernel_map[i].key == hashkey )
{
break;
}
}
if( i < kernel_map_size )
if ( i < kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
@ -413,19 +413,23 @@ static vsi_nn_kernel_node_t _setup
int32_t width = inputs[0]->attr.size[0];
int32_t height = inputs[0]->attr.size[1];
int32_t group_num = (width + 15) / 16;
int32_t input_zp = inputs[0]->attr.dtype.zero_point;
float input_scale = inputs[0]->attr.dtype.scale;
int32_t input_fl = inputs[0]->attr.dtype.fl;
int32_t output_zp = outputs[0]->attr.dtype.zero_point;
float output_scale = outputs[0]->attr.dtype.scale;
int32_t output_fl = outputs[0]->attr.dtype.fl;
int32_t input_zp = 0;
float input_scale = 1.0f;
int32_t input_fl = 0;
int32_t output_zp = 0;
float output_scale = 1.0f;
int32_t output_fl = 0;
float in_fl_scale = 1.0f, out_fl_scale = 1.0;
float dim_ratio = (float)1.0 / (float)(width * height);
if(inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT8
|| inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16
|| inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32)
if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC)
{
input_zp = inputs[0]->attr.dtype.zero_point;
input_scale = inputs[0]->attr.dtype.scale;
}
else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP)
{
input_fl = inputs[0]->attr.dtype.fl;
if (input_fl > 0)
{
in_fl_scale = (1.0f / ((float) ((int64_t)1 << input_fl)));
@ -434,12 +438,17 @@ static vsi_nn_kernel_node_t _setup
{
in_fl_scale = ((float) ((int64_t)1 << -input_fl));
}
input_zp = 0;
}
if(outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT8
|| outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16
|| outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32)
if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC)
{
output_zp = outputs[0]->attr.dtype.zero_point;
output_scale = 1.0f / outputs[0]->attr.dtype.scale;
}
else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP)
{
output_fl = outputs[0]->attr.dtype.fl;
if (output_fl > 0)
{
out_fl_scale = (float)((int64_t)1 << output_fl);
@ -448,9 +457,10 @@ static vsi_nn_kernel_node_t _setup
{
out_fl_scale = (1.0f / (float)((int64_t)1 << -output_fl));
}
output_zp = 0;
}
if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
@ -482,17 +492,17 @@ static vsi_nn_kernel_node_t _setup
hashkey = HASH_INSTANCENORM_KEY( in0_dtype, out_dtype, reshape_flg );
status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_MEAN_VARI );
if( VSI_SUCCESS != status )
if ( VSI_SUCCESS != status )
{
goto final;
}
status = _query_kernel( kernel, hashkey, INTERNAL_KERNEL_NORM );
if( VSI_SUCCESS != status )
if ( VSI_SUCCESS != status )
{
goto final;
}
if(reshape_flg)
if (reshape_flg)
{
int32_t shape[VSI_NN_MAX_DIM_NUM] = {0};
shape[0] = inputs[0]->attr.size[0];
@ -507,7 +517,7 @@ static vsi_nn_kernel_node_t _setup
shape[3] = outputs[0]->attr.dim_num > 3 ? outputs[0]->attr.size[3] : 1;
rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 );
}
if(inputs[1]->attr.dim_num < 2)
if (inputs[1]->attr.dim_num < 2)
{
int32_t shape[VSI_NN_MAX_DIM_NUM] = {0};
shape[0] = inputs[1]->attr.size[0];
@ -516,7 +526,7 @@ static vsi_nn_kernel_node_t _setup
shape[3] = 1;
rs_beta = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shape, 4 );
}
if(inputs[2]->attr.dim_num < 2)
if (inputs[2]->attr.dim_num < 2)
{
int32_t shape[VSI_NN_MAX_DIM_NUM] = {0};
shape[0] = inputs[2]->attr.size[0];
@ -528,10 +538,10 @@ static vsi_nn_kernel_node_t _setup
// Mean Vari
{
node = vsi_nn_kernel_create_node( graph, ikernels[MEAN_VARI_INDEX] );
if(node)
if (node)
{
uint32_t index = 0;
if(reshape_flg)
if (reshape_flg)
{
mean_vari_node_params[index++] = rs_input;
}
@ -565,10 +575,10 @@ static vsi_nn_kernel_node_t _setup
// Normalization
{
node = vsi_nn_kernel_create_node( graph, kernel );
if(node)
if (node)
{
uint32_t index = 0;
if(reshape_flg)
if (reshape_flg)
{
node_params[index++] = rs_input;
}
@ -576,7 +586,7 @@ static vsi_nn_kernel_node_t _setup
{
node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t;
}
if(inputs[1]->attr.dim_num < 2)
if (inputs[1]->attr.dim_num < 2)
{
node_params[index++] = rs_beta;
}
@ -584,7 +594,7 @@ static vsi_nn_kernel_node_t _setup
{
node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t;
}
if(inputs[2]->attr.dim_num < 2)
if (inputs[2]->attr.dim_num < 2)
{
node_params[index++] = rs_gamma;
}
@ -593,7 +603,7 @@ static vsi_nn_kernel_node_t _setup
node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t;
}
node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t;
if(reshape_flg)
if (reshape_flg)
{
node_params[index++] = rs_output;
}
@ -634,26 +644,26 @@ static vsi_nn_kernel_node_t _setup
/* Pass parameters to node. */
final:
if(rs_beta)
if (rs_beta)
{
vsi_nn_kernel_tensor_release( &rs_beta );
}
if(rs_gamma)
if (rs_gamma)
{
vsi_nn_kernel_tensor_release( &rs_gamma );
}
if(reshape_flg)
if (reshape_flg)
{
vsi_nn_kernel_tensor_release( &rs_input );
vsi_nn_kernel_tensor_release( &rs_output );
}
for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ )
{
if( ikernels[i] )
if ( ikernels[i] )
{
vsi_nn_kernel_release( &ikernels[i] );
}
if( tensors[i] )
if ( tensors[i] )
{
vsi_nn_ReleaseTensor( &tensors[i] );
}
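
The setup code above now selects quantization parameters by qnt_type instead of by element type: affine tensors contribute zero_point/scale, while DFP tensors derive the scale from the fractional length fl and force the zero point to 0. A self-contained sketch of that fl-to-scale conversion, mirroring the shift expressions in the diff; the helper name is illustrative:

#include <stdio.h>
#include <stdint.h>

/* scale = 2^-fl, written with shifts exactly as in the kernel setup above. */
static float dfp_fl_to_scale(int32_t fl)
{
    if (fl > 0)
    {
        return 1.0f / (float)((int64_t)1 << fl);   /* fl =  7 -> 1/128 */
    }
    return (float)((int64_t)1 << -fl);             /* fl = -2 -> 4.0   */
}

int main(void)
{
    printf("%g %g\n", dfp_fl_to_scale(7), dfp_fl_to_scale(-2));  /* 0.0078125 4 */
    return 0;
}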


@ -0,0 +1,395 @@
/****************************************************************************
*
* Copyright (c) 2019 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define KERNEL_SOURCE_1 "layer_normalization"
#define HASH_LAYERNORM_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("cl.layer_norm_"#SRC0_TYPE"to"#DST_TYPE)
// Add kernel hashtable here
#define HASH_LAYERNORM_KEY(_input0_type, _output_type, _reshape_flag) \
((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8))
#define TENSOR_LAYERNORM_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, 0), \
HASH_LAYERNORM_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \
SOURCE },
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _layernorm_kernel_map[] =
{
// Register kernel here
TENSOR_LAYERNORM_KERNELS( F32, F32, KERNEL_SOURCE_1 )
TENSOR_LAYERNORM_KERNELS( U8, U8, KERNEL_SOURCE_1 )
};
/*
* Kernel params
*/
static vx_param_description_t _layernorm_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _LAYERNORM_PARAM_NUM _cnt_of_array( _layernorm_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_layernorm_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
vsi_int_array_t * input_shape = NULL;
//int32_t width = 0;
int32_t height = 0;
int32_t chn = 0;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
input_shape = attr[0]->shape;
//width = input_shape->data[0];
height = input_shape->data[1];
chn = (input_shape->size <= 2) ? 1 : input_shape->data[2];
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.local_size[0] = 16;
gpu_param.local_size[1] = 1;
gpu_param.local_size[2] = 1;
gpu_param.global_size[0] = 16;
gpu_param.global_size[1] = height;
gpu_param.global_size[2] = chn;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, final);
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;
} /* _layernorm_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t* kernel,
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
int32_t reshape2D
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
int i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (input0_dtype == F16 && output_dtype == F16)
{
input0_dtype = F32;
output_dtype = F32;
}
key = HASH_LAYERNORM_KEY( input0_dtype, output_dtype, 0 );
for( i = 0; i < _cnt_of_array(_layernorm_kernel_map); i ++ )
{
if ( _layernorm_kernel_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(_layernorm_kernel_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _layernorm_kernel_map[i].function_name );
kernel->info.parameters = _layernorm_kernel_param_def;
kernel->info.numParams = _LAYERNORM_PARAM_NUM;
kernel->info.initialize = _layernorm_initializer;
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
_layernorm_kernel_map[i].source_name );
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
_layernorm_kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_LAYERNORM_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
vsi_nn_kernel_tensor_t rs_gamma = NULL, rs_beta = NULL;
float eps = vsi_nn_kernel_param_get_float32( params, "eps" );
int32_t width = inputs[0]->attr.size[0];
int32_t height = inputs[0]->attr.size[1];
int32_t input_fl = 0;
float input_zp = 0.0f;
float input_scale = 1.0f;
int32_t output_fl = 0;
float output_zp = 0.0f;
float output_scale = 1.0f;
float e2InScale = 1.0f, scale_inOut = 1.0f;
float dim_ratio = (float)1.0 / (float)(width);
float sumZpScale = 0.0f;
float zp2ScaleE2 = 0.0f;
float sumZpScaleE2 = 0.0f;
if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC)
{
input_zp = (float)inputs[0]->attr.dtype.zero_point;
input_scale = inputs[0]->attr.dtype.scale;
}
else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP)
{
input_fl = inputs[0]->attr.dtype.fl;
if (input_fl > 0)
{
input_scale = (1.0f / ((float) ((int64_t)1 << input_fl)));
}
else
{
input_scale = ((float) ((int64_t)1 << -input_fl));
}
input_zp = 0.0f;
}
if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC)
{
output_zp = (float)outputs[0]->attr.dtype.zero_point;
output_scale = 1.0f / outputs[0]->attr.dtype.scale;
}
else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP)
{
output_fl = outputs[0]->attr.dtype.fl;
if (output_fl > 0)
{
output_scale = (float)((int64_t)1 << output_fl);
}
else
{
output_scale = (1.0f / (float)((int64_t)1 << -output_fl));
}
output_zp = 0.0f;
}
scale_inOut = input_scale * output_scale;
e2InScale = input_scale * input_scale;
sumZpScale = width * input_zp * input_scale;
zp2ScaleE2 = input_zp * 2 * e2InScale;
sumZpScaleE2 = width * input_zp * input_zp * e2InScale;
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( kernel, inputs, outputs, 0 );
if ( VSI_SUCCESS != status )
{
goto final;
}
if (inputs[1]->attr.dim_num < 2)
{
int32_t shape[VSI_NN_MAX_DIM_NUM] = {0};
shape[0] = inputs[1]->attr.size[0];
shape[1] = 1;
shape[2] = 1;
shape[3] = 1;
rs_beta = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shape, 4 );
}
if (inputs[2]->attr.dim_num < 2)
{
int32_t shape[VSI_NN_MAX_DIM_NUM] = {0};
shape[0] = inputs[2]->attr.size[0];
shape[1] = 1;
shape[2] = 1;
shape[3] = 1;
rs_gamma = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shape, 4 );
}
// Normalization
{
node = vsi_nn_kernel_create_node( graph, kernel );
if (node)
{
uint32_t index = 0;
node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t;
if (inputs[1]->attr.dim_num < 2)
{
node_params[index++] = rs_beta;
}
else
{
node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t;
}
if (inputs[2]->attr.dim_num < 2)
{
node_params[index++] = rs_gamma;
}
else
{
node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t;
}
node_params[index++] = (vsi_nn_kernel_node_param_t)outputs[0]->t;
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_zp );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &e2InScale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_inOut );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &sumZpScale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &zp2ScaleE2 );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &sumZpScaleE2 );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &dim_ratio );
status = vsi_nn_kernel_node_pass_param( node, node_params,
_LAYERNORM_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
vsi_nn_kernel_scalar_release( &node_params[6] );
vsi_nn_kernel_scalar_release( &node_params[7] );
vsi_nn_kernel_scalar_release( &node_params[8] );
vsi_nn_kernel_scalar_release( &node_params[9] );
vsi_nn_kernel_scalar_release( &node_params[10] );
vsi_nn_kernel_scalar_release( &node_params[11] );
vsi_nn_kernel_scalar_release( &node_params[12] );
vsi_nn_kernel_scalar_release( &node_params[13] );
vsi_nn_kernel_scalar_release( &node_params[14] );
vsi_nn_kernel_scalar_release( &node_params[15] );
vsi_nn_kernel_scalar_release( &node_params[16] );
}
}
/* Pass parameters to node. */
final:
if (rs_beta)
{
vsi_nn_kernel_tensor_release( &rs_beta );
}
if (rs_gamma)
{
vsi_nn_kernel_tensor_release( &rs_gamma );
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( layer_norm, _setup )
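
The constants e2InScale, sumZpScale, zp2ScaleE2 and sumZpScaleE2 computed in _setup let the shader accumulate raw quantized values and still recover the dequantized mean and variance, assuming the affine model x_f = (q - zp) * scale. A small numeric check of that algebra:

#include <stdio.h>

/* With x_f = (q - zp) * scale:
 *   sum(x_f)   = scale * sum(q) - width*zp*scale                 (sumZpScale)
 *   sum(x_f^2) = scale^2 * sum(q^2)                               (e2InScale)
 *                - 2*zp*scale^2 * sum(q)                          (zp2ScaleE2)
 *                + width*zp^2*scale^2                             (sumZpScaleE2) */
int main(void)
{
    const int   width = 4;
    const int   q[4] = { 3, 120, 200, 255 };
    const float scale = 0.05f;
    const float zp = 128.0f;
    float sum_q = 0.0f, sum_q2 = 0.0f, sum_f = 0.0f, sum_f2 = 0.0f;
    int i;
    for (i = 0; i < width; i++)
    {
        float f = ((float)q[i] - zp) * scale;
        sum_q  += (float)q[i];
        sum_q2 += (float)q[i] * (float)q[i];
        sum_f  += f;
        sum_f2 += f * f;
    }
    {
        float e2InScale    = scale * scale;
        float sumZpScale   = width * zp * scale;
        float zp2ScaleE2   = 2.0f * zp * e2InScale;
        float sumZpScaleE2 = width * zp * zp * e2InScale;
        printf("sum   direct=%f reconstructed=%f\n", sum_f,  scale * sum_q - sumZpScale);
        printf("sumsq direct=%f reconstructed=%f\n", sum_f2, e2InScale * sum_q2 - zp2ScaleE2 * sum_q + sumZpScaleE2);
    }
    return 0;
}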


@ -59,6 +59,9 @@ __BEGIN_DECLS
#define HASH_MATRIXMUL_TRANSA_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \
CVIVANTE_NAMESPACE("cl.gemm_transa_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM)
#define HASH_MATRIXMUL_TRANSB_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \
CVIVANTE_NAMESPACE("cl.gemm_transb_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM)
#define TENSOR_MATRIXMUL_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \
{ HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0), \
HASH_MATRIXMUL_SH_KERNEL_NAME(F32, F32, F32, IMAGE_DIM), \
@ -69,6 +72,11 @@ __BEGIN_DECLS
HASH_MATRIXMUL_TRANSA_SH_KERNEL_NAME(F32, F32, F32, IMAGE_DIM), \
SOURCE },
#define TENSOR_MATRIXMUL_TRANSB_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \
{ HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 2), \
HASH_MATRIXMUL_TRANSB_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \
SOURCE },
static const struct {
uint32_t key;
char* function_name;
@ -83,6 +91,10 @@ static const struct {
TENSOR_MATRIXMUL_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_1)
TENSOR_MATRIXMUL_TRANSA_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_2)
TENSOR_MATRIXMUL_TRANSA_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_2)
TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_1)
TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_1)
TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, I8, F32, _2D, KERNEL_SOURCE_1)
TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, I8, F32, _3D, KERNEL_SOURCE_1)
};
/*
@ -98,6 +110,12 @@ static vx_param_description_t _matrixmul_kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _MATRIXMUL_PARAM_NUM _cnt_of_array(_matrixmul_kernel_param_def)
@ -130,7 +148,7 @@ DEF_KERNEL_INITIALIZER(_matrixmul_initializer)
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
width = attr[0]->shape->data[0];
height = attr[0]->shape->data[0];
height = attr[0]->shape->data[1];
chn = attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1;
gpu_param.global_scale[0] = 1;
@ -175,22 +193,27 @@ static vsi_status _query_kernel
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if(depth > 1)
if (depth > 1)
{
dim_type = _3D;
}
if (input1_dtype == I16 || input1_dtype == I32)
{
input1_dtype = I8;
}
key = HASH_MATRIXMUL_KEY( input0_dtype, input1_dtype, output_dtype, dim_type, transa );
for( i = 0; i < _cnt_of_array(matrixmul_map); i ++ )
{
if( matrixmul_map[i].key == key )
if ( matrixmul_map[i].key == key )
{
break;
}
}
if( i < _cnt_of_array(matrixmul_map) )
if ( i < _cnt_of_array(matrixmul_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", matrixmul_map[i].function_name );
kernel->info.parameters = _matrixmul_kernel_param_def;
@ -223,48 +246,111 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_t node = NULL;
int32_t transposeA = vsi_nn_kernel_param_get_int32( params, "transposeA" );
int32_t transposeB = vsi_nn_kernel_param_get_int32( params, "transposeB" );
int32_t transFlg = 0;
uint32_t M = inputs[0]->attr.size[1];
uint32_t K = inputs[0]->attr.size[0];
uint32_t N = inputs[1]->attr.size[0];
uint32_t depth = outputs[0]->attr.dim_num > 2 ? outputs[0]->attr.size[2] : 1;
uint32_t ac2zero = 0;
uint32_t bc2zero = 0;
float scale_a = 1.0f;
float zp_a = 0;
float scale_b = 1.0f;
float zp_b = 0;
float scale_out = 1.0f;
float zp_out = 0;
if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
if(transposeB)
if (transposeB)
{
return NULL;
N = inputs[1]->attr.size[1];
transFlg = 2;
}
if(transposeA)
if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP)
{
if (inputs[0]->attr.dtype.fl > 0)
{
scale_a = (1.0f / ((float) ((int64_t)1 << inputs[0]->attr.dtype.fl)));
}
else
{
scale_a = ((float) ((int64_t)1 << -inputs[0]->attr.dtype.fl));
}
zp_a = 0;
}
else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC)
{
zp_a = (float)inputs[0]->attr.dtype.zero_point;
scale_a = inputs[0]->attr.dtype.scale;
}
if (inputs[1]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP)
{
if (inputs[1]->attr.dtype.fl > 0)
{
scale_b = (1.0f / ((float) ((int64_t)1 << inputs[1]->attr.dtype.fl)));
}
else
{
scale_b = ((float) ((int64_t)1 << -inputs[1]->attr.dtype.fl));
}
zp_b = 0;
}
else if (inputs[1]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC)
{
zp_b = (float)inputs[1]->attr.dtype.zero_point;
scale_b = inputs[1]->attr.dtype.scale;
}
if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP)
{
if (outputs[0]->attr.dtype.fl > 0)
{
scale_out = (float)((int64_t)1 << outputs[0]->attr.dtype.fl);
}
else
{
scale_out = (1.0f / (float)((int64_t)1 << -outputs[0]->attr.dtype.fl));
}
zp_out = 0;
}
else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC)
{
zp_out = (float)outputs[0]->attr.dtype.zero_point;
scale_out = outputs[0]->attr.dtype.scale;
}
if (transposeA)
{
K = inputs[0]->attr.size[1];
M = inputs[0]->attr.size[0];
transFlg = 1;
}
if((inputs[0]->attr.dim_num > inputs[1]->attr.dim_num) ||
if ((inputs[0]->attr.dim_num > inputs[1]->attr.dim_num) ||
(inputs[0]->attr.size[2] > inputs[1]->attr.size[2]
&& inputs[0]->attr.dim_num > 2 && inputs[1]->attr.dim_num > 2))
{
bc2zero = 1;
}
else if((inputs[1]->attr.dim_num > inputs[0]->attr.dim_num) ||
else if ((inputs[1]->attr.dim_num > inputs[0]->attr.dim_num) ||
(inputs[1]->attr.size[2] > inputs[0]->attr.size[2]
&& inputs[0]->attr.dim_num > 2 && inputs[1]->attr.dim_num > 2))
{
ac2zero = 1;
}
status = _query_kernel( kernel, inputs, outputs, depth, transposeA );
if( VSI_SUCCESS == status)
status = _query_kernel( kernel, inputs, outputs, depth, transFlg );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
uint32_t index = 3;
/* Pass parameters to node. */
@ -275,6 +361,12 @@ static vsi_nn_kernel_node_t _setup
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &N );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ac2zero );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &bc2zero );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_a );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &zp_a );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_b );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &zp_b );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_out );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &zp_out );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _MATRIXMUL_PARAM_NUM );
CHECK_STATUS(status);
@ -283,6 +375,12 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &node_params[5] );
vsi_nn_kernel_scalar_release( &node_params[6] );
vsi_nn_kernel_scalar_release( &node_params[7] );
vsi_nn_kernel_scalar_release( &node_params[8] );
vsi_nn_kernel_scalar_release( &node_params[9] );
vsi_nn_kernel_scalar_release( &node_params[10] );
vsi_nn_kernel_scalar_release( &node_params[11] );
vsi_nn_kernel_scalar_release( &node_params[12] );
vsi_nn_kernel_scalar_release( &node_params[13] );
}
}
return node;
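
With the new transb path, transFlg = 2 selects kernels that read B in transposed form and take N from inputs[1].size[1]. A plain CPU reference of the transposed-B GEMM those kernels are expected to implement; illustrative, not the CL code:

#include <stdio.h>

/* A: M x K row-major, B stored as N x K (i.e. already transposed), C: M x N. */
static void gemm_transb_ref(const float* a, const float* b_t, float* c,
                            int M, int K, int N)
{
    int m, n, k;
    for (m = 0; m < M; m++)
    {
        for (n = 0; n < N; n++)
        {
            float sum = 0.0f;
            for (k = 0; k < K; k++)
            {
                sum += a[m * K + k] * b_t[n * K + k];
            }
            c[m * N + n] = sum;
        }
    }
}

int main(void)
{
    const float a[4]   = { 1, 2, 3, 4 };   /* 2 x 2 */
    const float b_t[4] = { 1, 0, 0, 1 };   /* identity, stored as B^T */
    float c[4];
    gemm_transb_ref(a, b_t, c, 2, 2, 2);
    printf("%g %g %g %g\n", c[0], c[1], c[2], c[3]);  /* 1 2 3 4 */
    return 0;
}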


@ -0,0 +1,329 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
__BEGIN_DECLS
#define _ROI_ALIGN_KERNEL_SOURCE(_input_type) "roi_align"
#define STR(a) #a
// Add kernel hashtable here
#define ROI_ALIGN_HASH_KEY( IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE, _image_2d ) \
(( IN0_DTYPE ) | ( IN1_DTYPE << 7) | (IN2_DTYPE << 14) | (OUT_DTYPE << 21) | (_image_2d << 28))
#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE ) \
{ ROI_ALIGN_HASH_KEY( IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE, 0 ), \
CVIVANTE_NAMESPACE("cl.roi_align_"STR(IN0_DTYPE)"to"STR(OUT_DTYPE)), \
_ROI_ALIGN_KERNEL_SOURCE(IN0_DTYPE) }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _roi_align_kernel_map[] =
{
PACK_KERNEL_MAP(F32, F32, I32, F32),
};
/*
* Kernel params
*/
static vx_param_description_t _roi_align_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _ROI_ALIGN_PARAM_NUM _cnt_of_array( _roi_align_kernel_param_def )
#define SCALAR_SPATIAL_X_SCALE (4)
#define SCALAR_SPATIAL_Y_SCALE (5)
#define SCALAR_INPUT_WIDTH (6)
#define SCALAR_INPUT_HEIGHT (7)
#define SCALAR_RCP_OF_OUTPUT_WIDTH (8)
#define SCALAR_RCP_OF_OUTPUT_HEIGHT (9)
#define SCALAR_SAMPLING_X_RATIO (10)
#define SCALAR_SAMPLING_Y_RATIO (11)
#define SCALAR_DEPTH (12)
#define ROI_ALIGN_PARAM_NUM 13
#define ROI_ALIGN_QUANT_PARAM_NUM _cnt_of_array( _roi_align_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_roi_align_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * rois_attr = NULL;
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_int_array_t * rois_shape = NULL;
vsi_int_array_t * out_shape = NULL;
rois_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( rois_attr, "Create tensor attr buffer fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
rois_shape = rois_attr->shape;
out_shape = output_attr->shape;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.dim = 3;
gpu_param.global_size[0] = gpu_align_p2(
(out_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (
(out_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1]);
gpu_param.global_size[2] = rois_shape->data[1];
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
SAFE_FREE_TENSOR_ATTR(rois_attr);
SAFE_FREE_TENSOR_ATTR(output_attr);
return status;
} /* _roi_align_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
vsi_bool image_2d
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in0_dtype;
vsi_nn_kernel_dtype_e in1_dtype;
vsi_nn_kernel_dtype_e in2_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _roi_align_kernel_map;
size_t kernel_map_size = _cnt_of_array( _roi_align_kernel_map );
vx_param_description_t * param_def = _roi_align_kernel_param_def;
size_t param_def_size = ROI_ALIGN_QUANT_PARAM_NUM;
vx_kernel_initialize_f initializer = _roi_align_initializer;
uint32_t key;
uint32_t i;
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
in2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
in0_dtype = in0_dtype == F16 ? F32 : in0_dtype;
in1_dtype = in1_dtype == F16 ? F32 : in1_dtype;
key = ROI_ALIGN_HASH_KEY( in0_dtype, in1_dtype, in2_dtype, out_dtype, image_2d );
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = (uint32_t)param_def_size;
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
#define _INPUT_NUM (3)
#define _OUTPUT_NUM (1)
#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM)
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_ROI_ALIGN_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
vsi_bool image_2d = FALSE;
uint32_t rank[_IO_NUM] = {0};
int32_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }};
vsi_nn_tensor_t* reshape_tensors[_IO_NUM] = { NULL };
int32_t i = 0;
float width_ratio = vsi_nn_kernel_param_get_float32( params, "width_ratio" );
float height_ratio = vsi_nn_kernel_param_get_float32( params, "height_ratio" );
int32_t width_sample_num = vsi_nn_kernel_param_get_int32( params, "width_sample_num" );
int32_t height_sample_num = vsi_nn_kernel_param_get_int32( params, "height_sample_num" );
float width_scale = 1.0f / width_ratio;
float height_scale = 1.0f / height_ratio;
float in_width = (float)(inputs[0]->attr.size[0]);
float in_height = (float)(inputs[0]->attr.size[1]);
float rcp_of_out_width = 1.0f / (float)(outputs[0]->attr.size[0]);
float rcp_of_out_height = 1.0f / (float)(outputs[0]->attr.size[1]);
float sampling_x_ratio = width_sample_num > 0 ? (float)width_sample_num : 0;
float sampling_y_ratio = height_sample_num > 0 ? (float)height_sample_num : 0;
int depth = inputs[0]->attr.size[2];
vsi_nn_kernel_optimize_nchw2xhw_shape( (const int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num,
shapes[0], &rank[0]);
vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num,
shapes[1], &rank[1]);
vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[2]->attr.size, inputs[2]->attr.dim_num,
shapes[2], &rank[2]);
vsi_nn_kernel_optimize_nchw2xhw_shape( (const int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num,
shapes[3], &rank[3]);
for (i = 0; i < _INPUT_NUM; i++)
{
reshape_tensors[i] = vsi_nn_reshape_tensor( graph,
inputs[i], (uint32_t*)shapes[i], rank[i] );
}
reshape_tensors[_INPUT_NUM] = vsi_nn_reshape_tensor( graph,
outputs[0], (uint32_t*)shapes[_INPUT_NUM], rank[_INPUT_NUM] );
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[0]->attr.size,
inputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( kernel, reshape_tensors, &reshape_tensors[_INPUT_NUM], image_2d);
if ( VSI_SUCCESS == status )
{
size_t node_params_num = ROI_ALIGN_PARAM_NUM;
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _ROI_ALIGN_PARAM_NUM,
reshape_tensors, input_num, &reshape_tensors[_INPUT_NUM], output_num );
node_params[SCALAR_SPATIAL_X_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &width_scale );
node_params[SCALAR_SPATIAL_Y_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &height_scale );
node_params[SCALAR_INPUT_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &in_width );
node_params[SCALAR_INPUT_HEIGHT] = vsi_nn_kernel_scalar_create( graph, F32, &in_height );
node_params[SCALAR_RCP_OF_OUTPUT_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &rcp_of_out_width );
node_params[SCALAR_RCP_OF_OUTPUT_HEIGHT] = vsi_nn_kernel_scalar_create( graph, F32, &rcp_of_out_height );
node_params[SCALAR_SAMPLING_X_RATIO] = vsi_nn_kernel_scalar_create( graph, F32, &sampling_x_ratio );
node_params[SCALAR_SAMPLING_Y_RATIO] = vsi_nn_kernel_scalar_create( graph, F32, &sampling_y_ratio );
node_params[SCALAR_DEPTH] = vsi_nn_kernel_scalar_create( graph, I32, &depth );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SPATIAL_X_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SPATIAL_Y_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_WIDTH] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_HEIGHT] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_RCP_OF_OUTPUT_WIDTH] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_RCP_OF_OUTPUT_HEIGHT] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SAMPLING_X_RATIO] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SAMPLING_Y_RATIO] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_DEPTH] );
}
}
for (i = 0; i < _IO_NUM; i++)
{
if (reshape_tensors[i])
{
vsi_nn_ReleaseTensor( &reshape_tensors[i] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( roi_align, _setup )
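
The scalars passed to the kernel (spatial X/Y scales, input size, reciprocal output size, sampling ratios, depth) are the usual ROI Align bin geometry. A hedged sketch of how one output cell's sampling grid follows from them, NNAPI-style; the function and field names are illustrative:

#include <stdio.h>
#include <math.h>

typedef struct
{
    float bin_w, bin_h;   /* input units covered by one output cell */
    int   grid_x, grid_y; /* bilinear samples taken per cell        */
} roi_bin_geom;

/* Scale the ROI into feature-map coordinates, divide it into output_w x
 * output_h cells, and pick the sampling grid per cell (adaptive when the
 * sampling ratio is 0). */
static roi_bin_geom roi_align_bin_geometry(float x1, float y1, float x2, float y2,
                                           float width_scale, float height_scale,
                                           float rcp_out_w, float rcp_out_h,
                                           float sampling_x_ratio, float sampling_y_ratio)
{
    roi_bin_geom g;
    float roi_w = (x2 - x1) * width_scale;
    float roi_h = (y2 - y1) * height_scale;
    g.bin_w  = roi_w * rcp_out_w;
    g.bin_h  = roi_h * rcp_out_h;
    g.grid_x = sampling_x_ratio > 0.0f ? (int)sampling_x_ratio : (int)ceilf(g.bin_w);
    g.grid_y = sampling_y_ratio > 0.0f ? (int)sampling_y_ratio : (int)ceilf(g.bin_h);
    return g;
}

int main(void)
{
    /* 32x32 ROI on a 1/4-resolution feature map, 8x8 output, adaptive sampling. */
    roi_bin_geom g = roi_align_bin_geometry(0, 0, 32, 32, 0.25f, 0.25f,
                                            1.0f / 8.0f, 1.0f / 8.0f, 0.0f, 0.0f);
    printf("bin=%gx%g grid=%dx%d\n", g.bin_w, g.bin_h, g.grid_x, g.grid_y);
    return 0;
}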


@ -0,0 +1,298 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define KERNEL_SOURCE_1 "space2depth_internal"
#define HASH_SPACE2DEPTH_INTERNAL_KEY(_input0_type, _output_type, _opt_flg) \
((_input0_type << 24) | (_output_type << 16) | (_opt_flg << 8))
#define HASH_SPACE2DEPTH_INTERNAL_CL_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("cl.space2depth_internal_"#SRC0_TYPE"to"#DST_TYPE)
#define HASH_SPACE2DEPTH_INTERNAL_X2Y1_CL_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("cl.space2depth_internal_"#SRC0_TYPE"to"#DST_TYPE"_X2Y1")
#define TENSOR_SPACE2DEPTH_INTERNAL_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_SPACE2DEPTH_INTERNAL_KEY(IN0_TYPE, OUT_TYPE, 0), \
HASH_SPACE2DEPTH_INTERNAL_CL_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \
SOURCE },
#define TENSOR_SPACE2DEPTH_INTERNAL_OPT_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_SPACE2DEPTH_INTERNAL_KEY(IN0_TYPE, OUT_TYPE, 1), \
HASH_SPACE2DEPTH_INTERNAL_X2Y1_CL_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \
SOURCE },
static const struct {
uint32_t key;
char* function_name;
const char* source_name;
} kernel_map[] =
{
TENSOR_SPACE2DEPTH_INTERNAL_KERNELS(F32, F32, KERNEL_SOURCE_1)
TENSOR_SPACE2DEPTH_INTERNAL_KERNELS(U8, U8, KERNEL_SOURCE_1)
TENSOR_SPACE2DEPTH_INTERNAL_OPT_KERNELS(F32, F32, KERNEL_SOURCE_1)
TENSOR_SPACE2DEPTH_INTERNAL_OPT_KERNELS(U8, U8, KERNEL_SOURCE_1)
};
/*
* Kernel params
*/
static vx_param_description_t kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _CL_PARAM_NUM _cnt_of_array(kernel_param_def)
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_space2depth_internal_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_int_array_t * in_shape = NULL;
int32_t width = 0;
int32_t height = 0;
int32_t chn = 0;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
in_shape = attr[0]->shape;
width = in_shape->data[0];
height = in_shape->data[1];
chn = in_shape->size > 2 ? in_shape->data[2] : 1;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = height;
gpu_param.global_size[2] = chn;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
return status;
} /* _space2depth_internal_initializer() */
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel,
int32_t opt_flg
)
{
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8;
vsi_status status = VSI_FAILURE;
uint32_t key = 0;
int i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
/* Remap F16 to the F32 kernels before building the hash key, otherwise the
   F16/F16 case never matches an entry in kernel_map. */
if (input0_dtype == F16 && output_dtype == F16)
{
input0_dtype = F32;
output_dtype = F32;
}
key = HASH_SPACE2DEPTH_INTERNAL_KEY( input0_dtype, output_dtype, opt_flg );
for( i = 0; i < _cnt_of_array(kernel_map); i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(kernel_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = kernel_param_def;
kernel->info.numParams = _cnt_of_array( kernel_param_def );
kernel->info.initialize = _space2depth_internal_initializer;
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
kernel_map[i].source_name );
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t block_size_x = vsi_nn_kernel_param_get_int32( params, "block_size_x" );
int32_t block_size_y = vsi_nn_kernel_param_get_int32( params, "block_size_y" );
int32_t opt_flg = (block_size_x == 2 && block_size_y == 1) ? 1 : 0;
float inputScale = inputs[0]->attr.dtype.scale;
int32_t inputZp = inputs[0]->attr.dtype.zero_point;
float outputScale = outputs[0]->attr.dtype.scale;
int32_t outputZp = outputs[0]->attr.dtype.zero_point;
float scaleInOut = 1.0f;
float zpInOut = 0.0f;
if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP)
{
int32_t input_fl = inputs[0]->attr.dtype.fl;
if (input_fl > 0)
{
inputScale = (1.0f / ((float) ((int64_t)1 << input_fl)));
}
else
{
inputScale = ((float) ((int64_t)1 << -input_fl));
}
inputZp = 0;
}
else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE)
{
inputScale = 1.0f;
inputZp = 0;
}
if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP)
{
int32_t output_fl = outputs[0]->attr.dtype.fl;
if (output_fl > 0)
{
outputScale = (1.0f / ((float) ((int64_t)1 << output_fl)));
}
else
{
outputScale = ((float) ((int64_t)1 << -output_fl));
}
outputZp = 0;
}
else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE)
{
outputScale = 1.0f;
outputZp = 0;
}
scaleInOut = inputScale / outputScale;
zpInOut = outputZp - inputZp * scaleInOut;
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( inputs, outputs, kernel, opt_flg);
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
int32_t index = 2;
vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM,
inputs, 1, outputs, 1 );
node_params[index++] = vsi_nn_kernel_scalar_create(
graph, I32, &block_size_x );
node_params[index++] = vsi_nn_kernel_scalar_create(
graph, I32, &block_size_y );
node_params[index++] = vsi_nn_kernel_scalar_create(
graph, F32, &scaleInOut );
node_params[index] = vsi_nn_kernel_scalar_create(
graph, F32, &zpInOut );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM );
VSI_ASSERT( status == VSI_SUCCESS );
vsi_nn_kernel_scalar_release( &node_params[2] );
vsi_nn_kernel_scalar_release( &node_params[3] );
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( space2depth_internal, _setup )
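
scaleInOut and zpInOut collapse the input and output affine quantizations into a single fused requantization, out_q = in_q * scaleInOut + zpInOut, matching the expressions in _setup above. A quick numeric check:

#include <stdio.h>

int main(void)
{
    float inputScale = 0.5f,  outputScale = 0.25f;
    float inputZp    = 3.0f,  outputZp    = 10.0f;
    float scaleInOut = inputScale / outputScale;          /* as in the kernel setup */
    float zpInOut    = outputZp - inputZp * scaleInOut;
    float in_q       = 7.0f;
    float via_float  = (in_q - inputZp) * inputScale / outputScale + outputZp;
    float fused      = in_q * scaleInOut + zpInOut;
    printf("two-step=%f fused=%f\n", via_float, fused);   /* both print 18.0 */
    return 0;
}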


@ -173,7 +173,7 @@ final:
if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); }
}
return status;
} /* _pre_process_yuv420_exec() */
} /* _instance_norm_exec() */
/*
* Kernel params
*/


@ -0,0 +1,255 @@
/****************************************************************************
*
* Copyright (c) 2019 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _CPU_ARG_NUM (1)
#define _CPU_INPUT_NUM (3)
#define _CPU_OUTPUT_NUM (1)
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.layer_norm")
DEF_KERNEL_EXECUTOR(_layer_norm_exec)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
float * buffer[_CPU_IO_NUM] = { NULL };
size_t out_elements = 0;
vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
uint32_t i = 0;
float eps = .0f;
tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
tensors[2] = (vsi_nn_kernel_tensor_t)param[2];
tensors[3] = (vsi_nn_kernel_tensor_t)param[3];
attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] );
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );
attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] );
CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final );
out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[4], &eps);
CHECK_STATUS_FAIL_GOTO(status, final );
buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final );
buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final );
buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE );
    CHECK_PTR_FAIL_GOTO( buffer[2], "Create input2 buffer fail.", final );
buffer[3] = (float *)malloc( out_elements * sizeof(float) );
CHECK_PTR_FAIL_GOTO( buffer[3], "Create output buffer fail.", final );
memset( buffer[3], 0, out_elements * sizeof(float) );
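    /* Layer normalization over axis 0: for every (outer, inner) slice compute the
     * mean and variance of the axisSize elements, then write
     * out = (x - mean) / sqrt(var + eps) * gamma + beta. */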
{
uint32_t axis_first = 0;
uint32_t axis_num = 1;
uint32_t outerSize = 1;
uint32_t axisSize = 1;
uint32_t innerSize = 1;
uint32_t inner = 0;
uint32_t outer = 0;
for (i = 0; i < (uint32_t)axis_first; i++)
{
innerSize *= attr[0]->shape->data[i];
}
for(i = 0; i < (uint32_t)axis_num; i++)
{
axisSize *= attr[0]->shape->data[axis_first + i];
}
for (i = (uint32_t)axis_first + axis_num; i < attr[0]->shape->size; i++)
{
outerSize *= attr[0]->shape->data[i];
}
for ( outer = 0; outer < outerSize; ++outer)
{
for ( inner = 0; inner < innerSize; ++inner)
{
float sum = .0f;
float sumsq = .0f;
float mean = .0f;
float vari = .0f;
for (i = 0; i < (uint32_t)axisSize; ++i)
{
float value = buffer[0][(outer * axisSize + i) * innerSize + inner];
sum += value;
sumsq += (value * value);
}
mean = sum / (axisSize);
vari = sumsq / (axisSize) - mean * mean;
vari = (float)(1.0 / sqrtf(vari + eps));
for (i = 0; i < (uint32_t)axisSize; ++i)
{
int idx = (outer * axisSize + i) * innerSize + inner;
float data = buffer[0][idx] - mean;
                    float scaleVal = buffer[2][i];
                    float biasVal = buffer[1][i];
float normVal = data * vari * scaleVal + biasVal;
buffer[3][idx] = normVal;
}
}
}
}
status = vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3],
buffer[3], out_elements );
CHECK_STATUS_FAIL_GOTO( status, final );
final:
for( i = 0; i < _CPU_IO_NUM; i ++ )
{
if ( buffer[i] )
{
free( buffer[i] );
}
}
for( i = 0; i < _CPU_IO_NUM; i ++ )
{
if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); }
}
return status;
} /* _layer_norm_exec() */
/*
* Kernel params
*/
static vx_param_description_t _layer_normalization_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};
#define _LAYER_NORMALIZATION_PARAM_NUM _cnt_of_array( _layer_normalization_kernel_param_def )
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_layer_norm_exec,
_layer_normalization_kernel_param_def,
_LAYER_NORMALIZATION_PARAM_NUM,
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
return VSI_SUCCESS;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
status = _query_kernel( inputs, outputs, kernel );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
float eps = vsi_nn_kernel_param_get_float32( params, "eps" );
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM,
inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM );
backend_params[4] = vsi_nn_kernel_scalar_create( graph, F32, &eps );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );
CHECK_STATUS( status );
vsi_nn_kernel_scalar_release( &backend_params[4] );
}
else
{
status = VSI_FAILURE;
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( layer_norm, _setup )

View File

@ -0,0 +1,378 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _INPUT_NUM (3)
#define _OUTPUT_NUM (1)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.roi_align")
/*
* Kernel params
*/
static vx_param_description_t _roi_align_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _ROI_ALIGN_PARAM_NUM _cnt_of_array( _roi_align_kernel_param_def )
#define SCALAR_X_RATIO (4)
#define SCALAR_Y_RATIO (5)
#define SCALAR_X_SAMPLE (6)
#define SCALAR_Y_SAMPLE (7)
/*
* Kernel function
*/
static float _compute_region_coordinate(int32_t p, float bin_size, float roi_anchor, float max_value)
{
const float region_start = p * bin_size + roi_anchor;
return vsi_nn_clamp(region_start, 0.0f, max_value - 1);
}
static float _roi_align_1x1(float *input_ptr,
int32_t width,
int32_t height,
float region_start_x,
float bin_size_x,
int32_t grid_size_x,
float region_end_x,
float region_start_y,
float bin_size_y,
int32_t grid_size_y,
float region_end_y)
{
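    /* Average one output bin: sample a grid_size_y x grid_size_x grid of points
     * inside the bin, bilinearly interpolate each sample from the input plane and
     * return the mean of the samples. */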
if ((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
{
return 0;
}
else
{
float avg = 0;
int32_t iy = 0;
int32_t ix = 0;
// Iterate through the aligned pooling region
for (iy = 0; iy < grid_size_y; ++iy)
{
for (ix = 0; ix < grid_size_x; ++ix)
{
// Align the window in the middle of every bin
float y = region_start_y + ((float)iy + 0.5f) * bin_size_y / (float)(grid_size_y);
float x = region_start_x + ((float)ix + 0.5f) * bin_size_x / (float)(grid_size_x);
// Interpolation in the [0,0] [0,1] [1,0] [1,1] square
const int32_t y_low = (int32_t)y;
const int32_t x_low = (int32_t)x;
const int32_t y_high = vsi_nn_min(y_low + 1, height - 1);
const int32_t x_high = vsi_nn_min(x_low + 1, width - 1);
const float ly = y - y_low;
const float lx = x - x_low;
const float hy = 1.0f - ly;
const float hx = 1.0f - lx;
const float w1 = hy * hx;
const float w2 = hy * lx;
const float w3 = ly * hx;
const float w4 = ly * lx;
const float data1 = *(input_ptr + y_low * width + x_low);
const float data2 = *(input_ptr + y_low * width + x_high);
const float data3 = *(input_ptr + y_high * width + x_low);
const float data4 = *(input_ptr + y_high * width + x_high);
avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
}
}
avg /= grid_size_x * grid_size_y;
return avg;
}
}
DEF_KERNEL_EXECUTOR(_compute)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
size_t out_elements[_OUTPUT_NUM] = {0};
size_t out_bytes[_OUTPUT_NUM] = {0};
uint32_t i = 0;
float width_scale = 0.0f;
float height_scale = 0.0f;
float width_ratio = 0.0f;
float height_ratio = 0.0f;
int32_t width_sample_num = 0;
int32_t height_sample_num = 0;
uint32_t n = 0;
uint32_t num_rois = 0;
int32_t inHeight = 0;
int32_t inWidth = 0;
int32_t inDepth = 0;
int32_t outHeight = 0;
int32_t outWidth = 0;
uint32_t kRoiDim = 4;
uint32_t out_index = 0;
/* prepare data */
for (i = 0; i < _INPUT_NUM; i ++)
{
input[i] = (vsi_nn_kernel_tensor_t)param[i];
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
        CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input buffer fail.", final );
}
for (i = 0; i < _OUTPUT_NUM; i ++)
{
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] );
out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
out_bytes[i] = out_elements[i] * sizeof(float);
f32_out_buffer[i] = (float *)malloc( out_bytes[i] );
CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
memset( f32_out_buffer[i], 0, out_bytes[i] );
}
vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_X_RATIO], &(width_ratio));
vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_Y_RATIO], &(height_ratio));
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_X_SAMPLE], &(width_sample_num));
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_Y_SAMPLE], &(height_sample_num));
width_scale = 1.0f / width_ratio;
height_scale = 1.0f / height_ratio;
num_rois = in_attr[1]->shape->data[1];
inWidth = in_attr[0]->shape->data[0];
inHeight = in_attr[0]->shape->data[1];
inDepth = in_attr[0]->shape->data[2];
outWidth = out_attr[0]->shape->data[0];
outHeight = out_attr[0]->shape->data[1];
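    /* Each ROI box is given as (x1, y1, x2, y2); quantized (U16) boxes are stored
     * in 0.125-pixel units. The box is mapped into feature-map coordinates with the
     * width/height scales and split into outWidth x outHeight bins. */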
for (n = 0; n < num_rois; n++)
{
uint32_t batchId = (uint32_t)f32_in_buffer[2][n];
float scale = (in_attr[1]->dtype == U16) ? 0.125f : 1.0f;
float qx1 = f32_in_buffer[1][n * kRoiDim];
float qy1 = f32_in_buffer[1][n * kRoiDim + 1];
float qx2 = f32_in_buffer[1][n * kRoiDim + 2];
float qy2 = f32_in_buffer[1][n * kRoiDim + 3];
float x1 = qx1 * scale;
float x2 = qx2 * scale;
float y1 = qy1 * scale;
float y2 = qy2 * scale;
float roi_anchor_x = x1 * width_scale;
float roi_anchor_y = y1 * height_scale;
float roi_dims_x = vsi_nn_max((x2 - x1) * width_scale, 1.0f);
float roi_dims_y = vsi_nn_max((y2 - y1) * height_scale, 1.0f);
float bin_size_x = roi_dims_x / outWidth;
float bin_size_y = roi_dims_y / outHeight;
int32_t batch_base_index = batchId * inHeight * inWidth * inDepth;
int32_t ch = 0;
int32_t py = 0;
int32_t px = 0;
for (ch = 0; ch < inDepth; ch++)
{
for (py = 0; py < outHeight; py++)
{
for (px = 0; px < outWidth; px++)
{
float region_start_x = _compute_region_coordinate(px, bin_size_x,
roi_anchor_x, (float)inWidth);
float region_start_y = _compute_region_coordinate(py, bin_size_y,
roi_anchor_y, (float)inHeight);
float region_end_x = _compute_region_coordinate(px + 1, bin_size_x,
roi_anchor_x, (float)inWidth);
float region_end_y = _compute_region_coordinate(py + 1, bin_size_y,
roi_anchor_y, (float)inHeight);
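                    /* If no sampling ratio was supplied (<= 0), fall back to an
                     * adaptive grid of ceil(bin_size) samples per bin. */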
int32_t roi_bin_grid_x = (width_sample_num > 0) ? width_sample_num : (int32_t)(ceil(bin_size_x));
int32_t roi_bin_grid_y = (height_sample_num > 0) ? height_sample_num : (int32_t)(ceil(bin_size_y));
float *input_ptr = &f32_in_buffer[0][batch_base_index + ch * inWidth * inHeight];
float out_val = 0;
out_val = _roi_align_1x1(
input_ptr, inWidth, inHeight, region_start_x, bin_size_x,
roi_bin_grid_x, region_end_x, region_start_y, bin_size_y,
roi_bin_grid_y, region_end_y);
f32_out_buffer[0][out_index++] = out_val;
}
}
}
}
/* save data */
for (i = 0; i < _OUTPUT_NUM; i++)
{
status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i],
f32_out_buffer[i], out_elements[i] );
CHECK_STATUS_FAIL_GOTO( status, final );
}
final:
for (i = 0; i < _INPUT_NUM; i++)
{
if (f32_in_buffer[i])
{
free(f32_in_buffer[i]);
f32_in_buffer[i] = NULL;
}
if (in_attr[i])
{
vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
}
}
for (i = 0; i < _OUTPUT_NUM; i++)
{
if (f32_out_buffer[i])
{
free(f32_out_buffer[i]);
f32_out_buffer[i] = NULL;
}
if (out_attr[i])
{
vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
}
}
return status;
} /* _compute() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
)
{
vsi_status status = VSI_FAILURE;
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _compute;
kernel->info.parameters = _roi_align_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _roi_align_kernel_param_def );
status = VSI_SUCCESS;
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_ROI_ALIGN_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
float width_ratio = vsi_nn_kernel_param_get_float32( params, "width_ratio" );
float height_ratio = vsi_nn_kernel_param_get_float32( params, "height_ratio" );
int32_t width_sample_num = vsi_nn_kernel_param_get_int32( params, "width_sample_num" );
int32_t height_sample_num = vsi_nn_kernel_param_get_int32( params, "height_sample_num" );
status = _query_kernel( kernel, inputs, outputs );
if( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _ROI_ALIGN_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[SCALAR_X_RATIO] = vsi_nn_kernel_scalar_create( graph, F32, &width_ratio );
node_params[SCALAR_Y_RATIO] = vsi_nn_kernel_scalar_create( graph, F32, &height_ratio );
node_params[SCALAR_X_SAMPLE] = vsi_nn_kernel_scalar_create( graph, I32, &width_sample_num );
node_params[SCALAR_Y_SAMPLE] = vsi_nn_kernel_scalar_create( graph, I32, &height_sample_num );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _ROI_ALIGN_PARAM_NUM );
VSI_ASSERT( status == VSI_SUCCESS );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_X_RATIO] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_Y_RATIO] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_X_SAMPLE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_Y_SAMPLE] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( roi_align, _setup )

View File

@ -0,0 +1,230 @@
/****************************************************************************
*
* Copyright (c) 2019 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _CPU_ARG_NUM (2)
#define _CPU_INPUT_NUM (1)
#define _CPU_OUTPUT_NUM (1)
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.space2depth_internal")
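/* CPU kernel parameter layout: param[0] = input tensor, param[1] = output tensor,
 * param[2] = block_size_x, param[3] = block_size_y. */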
DEF_KERNEL_EXECUTOR(_space2depth_internal_exec)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VX_FAILURE;
vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
float * buffer[2] = { NULL };
size_t out_elements = 0;
vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
uint32_t i = 0;
int32_t block_size_x = 1;
int32_t block_size_y = 1;
tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &block_size_x);
CHECK_STATUS_FAIL_GOTO(status, final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &block_size_y);
CHECK_STATUS_FAIL_GOTO(status, final );
buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final );
buffer[1] = (float *)malloc( out_elements * sizeof(float) );
CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final );
memset( buffer[1], 0, out_elements * sizeof(float) );
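    /* SPACE2DEPTH: every block_size_x * block_size_y spatial block of the input is
     * folded into the depth dimension; the in-block offsets select the output
     * channel group while the spatial extent shrinks by the block factors. */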
{
uint32_t output_depth = attr[1]->shape->data[2];
uint32_t output_height = attr[1]->shape->data[1];
uint32_t output_width = attr[1]->shape->data[0];
uint32_t input_batch = attr[0]->shape->size > 3 ? attr[0]->shape->data[3] : 1;
uint32_t input_depth = attr[0]->shape->data[2];
uint32_t input_height = attr[0]->shape->data[1];
uint32_t input_width = attr[0]->shape->data[0];
uint32_t batch = 0, in_h = 0, in_w = 0;
for (batch = 0; batch < input_batch; ++ batch)
{
uint32_t output_batch_index = batch * output_height * output_width * output_depth;
uint32_t input_batch_index = batch * input_height * input_width * input_depth;
uint32_t in_d = 0;
for (in_d = 0; in_d < input_depth; in_d ++)
{
for (in_h = 0; in_h < input_height; ++ in_h)
{
for (in_w = 0; in_w < input_width; in_w ++)
{
uint32_t out_w = in_w / block_size_x;
uint32_t out_h = in_h / block_size_y;
uint32_t out_d = (in_w % block_size_x) * input_depth
+ (in_h % block_size_y) * block_size_x * input_depth + in_d;
uint32_t in_index = in_w + in_h * input_width
+ in_d * input_height * input_width + input_batch_index;
uint32_t out_index = out_w + out_h * output_width
+ out_d * output_width * output_height + output_batch_index;
buffer[1][out_index] = buffer[0][in_index];
}
}
}
}
}
status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1],
buffer[1], out_elements );
CHECK_STATUS_FAIL_GOTO( status, final );
final:
for( i = 0; i < _CPU_IO_NUM; i ++ )
{
if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); }
if ( buffer[i] )
{
free( buffer[i] );
}
}
return status;
} /* _space2depth_internal_exec() */
/*
* Kernel params
*/
static vx_param_description_t _space2depth_internal_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};
#define _SPACE2DEPTH_INTERNAL_PARAM_NUM _cnt_of_array( _space2depth_internal_kernel_param_def )
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_space2depth_internal_exec,
_space2depth_internal_kernel_param_def,
_cnt_of_array( _space2depth_internal_kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
return VSI_SUCCESS;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VX_FAILURE;
vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
status = _query_kernel( inputs, outputs, kernel );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
uint32_t index = 2;
int32_t block_size_x = vsi_nn_kernel_param_get_int32( params, "block_size_x" );
int32_t block_size_y = vsi_nn_kernel_param_get_int32( params, "block_size_y" );
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM,
inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM );
backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size_x );
backend_params[index] = vsi_nn_kernel_scalar_create( graph, I32, &block_size_y );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );
CHECK_STATUS( status );
vsi_nn_kernel_scalar_release( &backend_params[2] );
vsi_nn_kernel_scalar_release( &backend_params[3] );
}
else
{
status = VSI_FAILURE;
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( space2depth_internal, _setup )

View File

@ -0,0 +1,264 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.upsamplescale")
/*
* Kernel params
*/
static vx_param_description_t _upsamplescale_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};
#define _UPSAMPLESCALE_PARAM_NUM _cnt_of_array( _upsamplescale_kernel_param_def )
#define SCALAR_STRIDE_VALUE (2)
#define SCALAR_SCALE_VALUE (3)
/*
* Kernel function
*/
DEF_KERNEL_EXECUTOR(_compute)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
size_t out_elements[_OUTPUT_NUM] = {0};
size_t out_bytes[_OUTPUT_NUM] = {0};
int32_t i = 0;
int32_t stride = 0;
float scale = 0.0f;
int32_t width = 0;
int32_t height = 0;
int32_t out_width = 0;
int32_t out_height = 0;
int32_t outerSize = 1;
int32_t x = 0;
int32_t y = 0;
/* prepare data */
for(i = 0; i < _INPUT_NUM; i ++)
{
input[i] = (vsi_nn_kernel_tensor_t)param[i];
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
        CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input buffer fail.", final );
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] );
out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
out_bytes[i] = out_elements[i] * sizeof(float);
f32_out_buffer[i] = (float *)malloc( out_bytes[i] );
CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
memset( f32_out_buffer[i], 0, out_bytes[i] );
}
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_STRIDE_VALUE], &stride);
vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_SCALE_VALUE], &scale);
width = in_attr[0]->shape->data[0];
height = in_attr[0]->shape->data[1];
for (i = 2; i < (int32_t)in_attr[0]->shape->size; i++)
{
outerSize *= in_attr[0]->shape->data[i];
}
out_width = out_attr[0]->shape->data[0];
out_height = out_attr[0]->shape->data[1];
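    /* Nearest-neighbour upsampling by 'stride' combined with a scalar multiply:
     * every input element is scaled by 'scale' and replicated into a
     * stride x stride block of the output plane. */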
for (i = 0; i < outerSize; i++)
{
for (y = 0; y < height; y++)
{
for (x = 0; x < width; x++)
{
int32_t in_idx = i * width * height + y * width + x;
int32_t base_idx = i * out_width * out_height
+ y * stride * out_width + x * stride;
int32_t dx = 0;
int32_t dy = 0;
float data = f32_in_buffer[0][in_idx] * scale;
for (dy = 0; dy < stride; dy++)
{
for (dx = 0; dx < stride; dx++)
{
int32_t idx = base_idx + dy * out_width + dx;
f32_out_buffer[0][idx] = data;
}
}
}
}
}
/* save data */
for(i = 0; i < _OUTPUT_NUM; i++)
{
status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i],
f32_out_buffer[i], out_elements[i] );
CHECK_STATUS_FAIL_GOTO( status, final );
}
final:
for (i = 0; i < _INPUT_NUM; i++)
{
if (f32_in_buffer[i])
{
free(f32_in_buffer[i]);
f32_in_buffer[i] = NULL;
}
if (in_attr[i])
{
vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
}
}
for(i = 0; i < _OUTPUT_NUM; i++)
{
if (f32_out_buffer[i])
{
free(f32_out_buffer[i]);
f32_out_buffer[i] = NULL;
}
if (out_attr[i])
{
vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
}
}
return status;
} /* _compute() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
/* Add extra params */
)
{
vsi_status status = VSI_SUCCESS;
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _compute;
kernel->info.parameters = _upsamplescale_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _upsamplescale_kernel_param_def );
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_UPSAMPLESCALE_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t stride = 0;
float scale = 1.0f;
stride = vsi_nn_kernel_param_get_int32(params, "stride");
scale = vsi_nn_kernel_param_get_float32(params, "scale");
status = _query_kernel( kernel, inputs, outputs /* Add extra params */ );
if( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _UPSAMPLESCALE_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[SCALAR_STRIDE_VALUE] = vsi_nn_kernel_scalar_create(
graph, I32, &stride );
node_params[SCALAR_SCALE_VALUE] = vsi_nn_kernel_scalar_create(
graph, F32, &scale );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _UPSAMPLESCALE_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_STRIDE_VALUE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_VALUE] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( upsamplescale, _setup )

View File

@ -79,8 +79,10 @@ typedef struct
static const _kernel_map_type _a_times_b_plus_c_kernel_map[] =
{
PACK_KERNEL_MAP(F16, F16, F16, F16),
PACK_KERNEL_MAP(F16, F16, F32, F16),
PACK_KERNEL_MAP_2D(F16, F16, F16, F16),
PACK_KERNEL_MAP_2D(F16, F16, F32, F16),
};
/*
@ -106,7 +108,7 @@ DEF_KERNEL_INITIALIZER(_a_times_b_plus_c_initializer)
)
{
#define _PACK_A_TIMES_B_PLUS_C_KEY( IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE ) \
(( IN1_TYPE << 24) | ( IN1_TYPE << 16) | ( IN0_TYPE << 8) | ( OUT_TYPE))
(( IN2_TYPE << 24) | ( IN1_TYPE << 16) | ( IN0_TYPE << 8) | ( OUT_TYPE))
vsi_status status = VX_SUCCESS;
// Alignment with a power of two value.
gpu_param_t gpu_param = {
@ -183,6 +185,48 @@ DEF_KERNEL_INITIALIZER(_a_times_b_plus_c_initializer)
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
case _PACK_A_TIMES_B_PLUS_C_KEY( F16, F16, F32, F16 ):
{
gpu_dp_inst_t uniA_Times_B_lo_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x01010101, // BSelt
0x00010000, 0x00030002, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniA_Times_B_hi_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00050004, 0x00070006, // ABin
0x01010101, // BSelt
0x00050004, 0x00070006, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractHalf8_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00002100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
}, GPU_DP_TYPE_16 };
status = vsi_nn_kernel_gpu_add_param( node,
"uniA_Times_B_lo_4x4", &uniA_Times_B_lo_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniA_Times_B_hi_4x4", &uniA_Times_B_hi_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtractHalf8_2x8", &uniExtractHalf8_2x8 );
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
default:
break;
}
@ -223,13 +267,13 @@ static vsi_status _query_kernel
vx_param_description_t * param_def = _a_times_b_plus_c_kernel_param_def;
size_t param_def_size = _cnt_of_array( _a_times_b_plus_c_kernel_param_def );
vx_kernel_initialize_f initializer = _a_times_b_plus_c_initializer;
uint32_t key;
uint32_t i;
uint32_t key = 0;
uint32_t i = 0;
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
in2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
in2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = A_TIMES_B_PLUS_C_HASH_KEY(in0_dtype, in1_dtype, in2_dtype, out_dtype, image_2d);

View File

@ -53,18 +53,34 @@ __BEGIN_DECLS
#define VX_KERNEL_NAME_GATHER_U8TOF16 CVIVANTE_NAMESPACE("evis.gather_U8toF16")
#define VX_KERNEL_NAME_GATHER_F16TOU8 CVIVANTE_NAMESPACE("evis.gather_F16toU8")
#define VX_KERNEL_NAME_GATHER_AXIS0_U8TOU8 CVIVANTE_NAMESPACE("evis.gather_U8toU8_axis0")
#define VX_KERNEL_NAME_GATHER_AXIS0_I8TOI8 CVIVANTE_NAMESPACE("evis.gather_I8toI8_axis0")
#define VX_KERNEL_NAME_GATHER_AXIS0_I16TOI16 CVIVANTE_NAMESPACE("evis.gather_I16toI16_axis0")
#define VX_KERNEL_NAME_GATHER_AXIS0_F16TOF16 CVIVANTE_NAMESPACE("evis.gather_F16toF16_axis0")
#define VX_KERNEL_NAME_GATHER_AXIS0_I8TOF16 CVIVANTE_NAMESPACE("evis.gather_I8toF16_axis0")
#define VX_KERNEL_NAME_GATHER_AXIS0_I16TOF16 CVIVANTE_NAMESPACE("evis.gather_I16toF16_axis0")
#define VX_KERNEL_NAME_GATHER_AXIS0_F16TOI8 CVIVANTE_NAMESPACE("evis.gather_F16toI8_axis0")
#define VX_KERNEL_NAME_GATHER_AXIS0_F16TOI16 CVIVANTE_NAMESPACE("evis.gather_F16toI16_axis0")
#define VX_KERNEL_NAME_GATHER_AXIS0_U8TOF16 CVIVANTE_NAMESPACE("evis.gather_U8toF16_axis0")
#define VX_KERNEL_NAME_GATHER_AXIS0_F16TOU8 CVIVANTE_NAMESPACE("evis.gather_F16toU8_axis0")
#define KERNEL_SOURCE_1 "gather"
#define KERNEL_SOURCE_2 "gather_mix"
// Add kernel hashtable here
#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _quant_type) \
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_quant_type))
#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _is_axis0) \
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_is_axis0))
#define TENSOR_GATHER_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0), \
VX_KERNEL_NAME_GATHER_##IN0_TYPE##TO##OUT_TYPE, \
SOURCE },
#define TENSOR_GATHER_AXIS0_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1), \
VX_KERNEL_NAME_GATHER_AXIS0_##IN0_TYPE##TO##OUT_TYPE, \
SOURCE },
static const struct {
uint32_t key;
char* function_name;
@ -81,6 +97,16 @@ static const struct {
TENSOR_GATHER_KERNELS(F16, I32, I16, KERNEL_SOURCE_2)
TENSOR_GATHER_KERNELS(U8, I32, F16, KERNEL_SOURCE_2)
TENSOR_GATHER_KERNELS(F16, I32, U8, KERNEL_SOURCE_2)
TENSOR_GATHER_AXIS0_KERNELS(U8, I32, U8, KERNEL_SOURCE_1)
TENSOR_GATHER_AXIS0_KERNELS(I8, I32, I8, KERNEL_SOURCE_1)
TENSOR_GATHER_AXIS0_KERNELS(I16, I32, I16, KERNEL_SOURCE_1)
TENSOR_GATHER_AXIS0_KERNELS(F16, I32, F16, KERNEL_SOURCE_1)
TENSOR_GATHER_AXIS0_KERNELS(I8, I32, F16, KERNEL_SOURCE_2)
TENSOR_GATHER_AXIS0_KERNELS(I16, I32, F16, KERNEL_SOURCE_2)
TENSOR_GATHER_AXIS0_KERNELS(F16, I32, I8, KERNEL_SOURCE_2)
TENSOR_GATHER_AXIS0_KERNELS(F16, I32, I16, KERNEL_SOURCE_2)
TENSOR_GATHER_AXIS0_KERNELS(U8, I32, F16, KERNEL_SOURCE_2)
TENSOR_GATHER_AXIS0_KERNELS(F16, I32, U8, KERNEL_SOURCE_2)
};
/*
@ -123,7 +149,7 @@ static vsi_status get_gather_tensor_reshape_size
sizes[i] = 1;
}
if(idxFlg && elementCnt < VSI_NN_MAX_IMAGE_WIDTH)
if (idxFlg && elementCnt < VSI_NN_MAX_IMAGE_WIDTH)
{
sizes[0] = elementCnt;
sizes[1] = 1;
@ -131,7 +157,7 @@ static vsi_status get_gather_tensor_reshape_size
}
else
{
if((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH)
if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH)
{
sizes[0] = block_size;
sizes[1] = elementCnt / block_size;
@ -191,7 +217,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
src0Scale = attr[0]->asymm.scale;
dstZP = attr[2]->asymm.zero_point;
dstScale = attr[2]->asymm.scale;
if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[0]->dfp.fl > 0)
{
@ -202,12 +228,12 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
}
else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
src0Scale = 1;
}
if( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[2]->dfp.fl > 0)
{
@ -219,7 +245,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
}
dstScale = 1.0f/dstScale;
}
else if( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE )
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
dstScale = 1;
}
@ -232,7 +258,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
}
shaderParam.global_scale[0] = 16;
if(attr[0]->dtype == I16 || attr[0]->dtype == F16)
if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
{
shaderParam.global_scale[0] = 8;
}
@ -340,6 +366,214 @@ OnError:
return status;
}
DEF_KERNEL_INITIALIZER(_gather_axis0_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t shaderParam = {
3, // workdim
{0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in thread
{0, 0, 0}}; // globalWorkSize: image size in thread
int32_t block_num = 0;
int32_t indices_num = 1;
uint32_t input_dims1 = 0;
vx_uint32 i = 0;
vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL};
vsi_int_array_t * input1_shape = NULL;
int32_t src0ZP = 0;
float src0Scale = 0;
int32_t dstZP = 0;
float dstScale = 0;
uint32_t pack_key = 0;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &block_num);
CHECK_STATUS_FAIL_GOTO(status, OnError );
src0ZP = attr[0]->asymm.zero_point;
src0Scale = attr[0]->asymm.scale;
dstZP = attr[2]->asymm.zero_point;
dstScale = attr[2]->asymm.scale;
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[0]->dfp.fl > 0)
{
src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
}
else
{
src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
src0Scale = 1;
}
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[2]->dfp.fl > 0)
{
dstScale = (float)((int64_t)1 << attr[2]->dfp.fl);
}
else
{
dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl));
}
dstScale = 1.0f/dstScale;
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
dstScale = 1;
}
input1_shape = attr[1]->shape;
input_dims1 = (uint32_t)input1_shape->size;
for (i = 0; i < input_dims1; i++)
{
indices_num *= input1_shape->data[i];
}
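    /* One shader thread handles 4 indices along x (global_scale[0] = 4);
     * the y dimension walks the block_num outer blocks. */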
shaderParam.global_scale[0] = 4;
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
shaderParam.global_size[0] = gpu_align_p2((indices_num + shaderParam.global_scale[0] - 1)
/ shaderParam.global_scale[0], 4);
shaderParam.global_size[1] = block_num;
shaderParam.global_size[2] = 1;
status = vsi_nn_kernel_gpu_config( node, &shaderParam );
CHECK_STATUS_FAIL_GOTO(status, OnError);
#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE) \
(IN0_TYPE | (OUT_TYPE << 8))
pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype);
{
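        /* gpu_quantize_multiplier_16bit() folds the float rescale factor
         * (src0Scale / dstScale) into a 16-bit multiplier M0 plus a post-shift,
         * which the DP instructions below consume together with the zero points. */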
uint16_t M0 = 0;
int32_t postShift = 0;
uint32_t multAndoutZP0[2] = {0};
uint32_t multAndoutZP1[2] = {0};
gpu_dp_inst_t uniU8MulAndPostShift_0_Lo_2x8 = {{
0xdddddddd, // TCfg
0x44444444, // ASelt
0x13121110, 0x17161514, // ABin
0x11111111, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertFp16toU8_2x8 = {{
0xdddddddd, // TCfg
0x44444444, // ASelt
0x13121110, 0x17161514, // ABin
0x11111111, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtraCopyDpKeepinEvis_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
            0x00000001, 0x00000001, 0x00000001, 0x00000001,
            0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
switch( pack_key )
{
case _PACK_SELECT_KEY( U8, F16):
case _PACK_SELECT_KEY( I8, F16):
case _PACK_SELECT_KEY( I16, F16):
{
gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift);
multAndoutZP0[0] = (uint32_t)(M0);
multAndoutZP0[1] = (uint32_t)((dstZP << postShift) - src0ZP * M0);
gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift );
status = vsi_nn_kernel_gpu_add_param( node,
"uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 );
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( F16, U8):
case _PACK_SELECT_KEY( F16, I8):
case _PACK_SELECT_KEY( F16, I16):
{
int32_t postShift0 = 0;
gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift0);
multAndoutZP1[0] = (uint32_t)(M0);
multAndoutZP1[1] = (uint32_t)((dstZP << postShift0) - src0ZP * M0);
gpu_dp_inst_update_postshfit( &uniConvertFp16toU8_2x8, postShift0 );
status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvertFp16toU8_2x8", &uniConvertFp16toU8_2x8 );
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( I16, I16):
case _PACK_SELECT_KEY( I8, I8):
case _PACK_SELECT_KEY( U8, U8):
case _PACK_SELECT_KEY( F16, F16):
{
status = vsi_nn_kernel_gpu_add_param( node,
"uniExtraCopyDpKeepinEvis_2x8", &uniExtraCopyDpKeepinEvis_2x8 );
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
default:
break;
}
}
#undef _PACK_SELECT_KEY
status = vsi_nn_kernel_gpu_add_param(node, "indices_num", &indices_num);
CHECK_STATUS_FAIL_GOTO(status, OnError );
OnError:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
if (attr[2])
{
vsi_nn_kernel_tensor_attr_release( &attr[2] );
attr[2] = NULL;
}
return status;
}
/*
* Query kernel
*/
@ -348,7 +582,8 @@ static vsi_status _query_kernel
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel,
const vsi_nn_kernel_param_t * params
const vsi_nn_kernel_param_t * params,
int32_t axis
)
{
vsi_status status = VSI_FAILURE;
@ -360,21 +595,28 @@ static vsi_status _query_kernel
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, 0 );
key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, axis );
for( i = 0; i < _cnt_of_array(gather_map); i ++ )
{
if( gather_map[i].key == key )
if ( gather_map[i].key == key )
{
break;
}
}
if( i < _cnt_of_array(gather_map) )
if ( i < _cnt_of_array(gather_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", gather_map[i].function_name );
kernel->info.parameters = _gather_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _gather_kernel_param_def );
kernel->info.initialize = _gather_initializer;
if (axis)
{
kernel->info.initialize = _gather_axis0_initializer;
}
else
{
kernel->info.initialize = _gather_initializer;
}
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
@ -405,26 +647,39 @@ static vsi_nn_kernel_node_t _setup
int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
int32_t block_num = vsi_nn_kernel_param_get_int32( params, "block_num" );
int32_t axis_num = vsi_nn_kernel_param_get_int32( params, "axis_num" );
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
int32_t axis0_flg = 0;
status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, 0);
status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1);
status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0);
if(status != VSI_SUCCESS)
if (axis == 0)
{
status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], axis_num, 0);
status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1);
status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], shapes[1][0], 0);
axis0_flg = 1;
}
else
{
status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, 0);
status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1);
status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0);
axis0_flg = 0;
}
if (status != VSI_SUCCESS)
{
return NULL;
}
if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( inputs, outputs, kernel, params );
if( VSI_SUCCESS == status)
status = _query_kernel( inputs, outputs, kernel, params, axis0_flg);
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
uint32_t index = 0;
#define RESHAPE_DIM 2

View File

@ -183,7 +183,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer)
vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
vsi_int_array_t * input_shape = NULL;
float scaleIn = 0;
float scaleIn = 1;
int32_t input_zp = 0;
vx_uint32 iter = 0;
int32_t sumInZp = 0;
@ -206,10 +206,13 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
input_shape = attr[0]->shape;
input_zp = attr[0]->asymm.zero_point;
scaleIn = attr[0]->asymm.scale;
if(attr[0]->dtype == I8 || attr[0]->dtype == I16)
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
input_zp = attr[0]->asymm.zero_point;
scaleIn = attr[0]->asymm.scale;
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[0]->dfp.fl > 0)
{
@ -225,13 +228,13 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer)
width = input_shape->data[0];
height = input_shape->data[1];
chn = attr[1]->shape->data[1];
if(rsFlg)
if (rsFlg)
{
height = height / chn;
}
iter = height * 16;
if(attr[0]->dtype == U8)
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
sumInZp = input_zp * iter * (-1);
tmpZp1 = (-2) * input_zp;
@ -247,11 +250,11 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer)
shaderParam.local_size[1] = 1;
shaderParam.local_size[2] = 1;
if(attr[0]->dtype == I8 || attr[0]->dtype == U8)
if (attr[0]->dtype == I8 || attr[0]->dtype == U8)
{
shaderParam.global_size[0] = (width + 255) / 256 * 16;
}
else if(attr[0]->dtype == I16 || attr[0]->dtype == F16)
else if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
{
shaderParam.global_size[0] = (width + 127) / 128 * 16;
}
@ -261,7 +264,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer)
status = vsi_nn_kernel_gpu_config( node, &shaderParam );
CHECK_STATUS_FAIL_GOTO(status, OnError);
if(attr[0]->dtype == U8)
if (attr[0]->dtype == U8)
{
gpu_dp_inst_t uniSumU8_16x1 = {{
0x55555555, // TCfg
@ -290,7 +293,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "rowSumScale", &rowSumScale);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
else if(attr[0]->dtype == I8)
else if (attr[0]->dtype == I8)
{
gpu_dp_inst_t uniSumInt8_16x1 = {{
0x55555555, // TCfg
@ -317,7 +320,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "inFlScale_s2", &inFlScale_s2);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
else if(attr[0]->dtype == I16)
else if (attr[0]->dtype == I16)
{
gpu_dp_inst_t uniInt16SumSqr_dp8x2 = {{
0x55555555, // TCfg
@ -333,7 +336,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "inFlScale_s2", &inFlScale_s2);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
else if(attr[0]->dtype == F16)
else if (attr[0]->dtype == F16)
{
gpu_dp_inst_t uniFp16SumSqr_dp8x2 = {{
0x55555555, // TCfg
@ -384,10 +387,10 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL};
vsi_int_array_t * input_shape = NULL;
float scaleIn = 0;
float scaleOut = 0;
float reScaleOut_u8 = 0;
float scale_inOut = 0;
float scaleIn = 1.0f;
float scaleOut = 1.0f;
float reScaleOut_u8 = 1.0f;
float scale_inOut = 1.0f;
int32_t output_zp = 0;
int32_t input_zp = 0;
float in_scale_fl = 1, out_scale_fl = 1, inOut_fl_scale = 1;
@ -407,12 +410,13 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
input_shape = attr[0]->shape;
input_zp = attr[0]->asymm.zero_point;
scaleIn = attr[0]->asymm.scale;
output_zp = attr[2]->asymm.zero_point;
scaleOut = attr[2]->asymm.scale;
if(attr[0]->dtype == I8 || attr[0]->dtype == I16)
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
input_zp = attr[0]->asymm.zero_point;
scaleIn = attr[0]->asymm.scale;
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[0]->dfp.fl > 0)
{
@ -422,9 +426,16 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
{
in_scale_fl = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
input_zp = 0;
}
if(attr[2]->dtype == I8 || attr[2]->dtype == I16)
if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
output_zp = attr[2]->asymm.zero_point;
scaleOut = attr[2]->asymm.scale;
reScaleOut_u8 = 1 / scaleOut;
}
else if (attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[2]->dfp.fl > 0)
{
@ -434,10 +445,11 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
{
out_scale_fl = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl));
}
output_zp = 0;
}
if((attr[2]->dtype == I8 || attr[2]->dtype == I16)
&& (attr[0]->dtype == I8 || attr[0]->dtype == I16))
if ((attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
&& (attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP))
{
inOut_fl_scale = in_scale_fl * out_scale_fl;
}
@ -445,21 +457,17 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
width = input_shape->data[0];
height = input_shape->data[1];
chn = attr[1]->shape->data[1];
if(rsFlg)
if (rsFlg)
{
height = height / chn;
}
if(attr[2]->dtype == U8)
{
reScaleOut_u8 = 1 / scaleOut;
}
dimRatio = (float)(1.0 / (width * height));
group_num = (width + 255) / 256;
shaderParam.global_scale[0] = 16;
if(attr[0]->dtype == I16 || attr[0]->dtype == F16)
if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
{
shaderParam.global_scale[0] = 8;
group_num = (width + 127) / 128;
@ -774,12 +782,12 @@ static vsi_status _query_kernel
for( i = 0; i < kernel_map_size; i ++ )
{
if( kernel_map[i].key == hashkey )
if ( kernel_map[i].key == hashkey )
{
break;
}
}
if( i < kernel_map_size )
if ( i < kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
@ -830,7 +838,7 @@ static vsi_nn_kernel_node_t _setup
int32_t reshape_flg = vsi_nn_kernel_param_get_int32( params, "reshape_flg" );
// Check if gpu can support the size
if( !vsi_nn_kernel_gpu_check_shape(
if ( !vsi_nn_kernel_gpu_check_shape(
(int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) )
{
return NULL;
@ -850,7 +858,7 @@ static vsi_nn_kernel_node_t _setup
attr.size[0] = ((inputs[0]->attr.size[0] + 255) / 256) * 4;
if( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16
if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16
|| inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16)
{
attr.size[0] = ((inputs[0]->attr.size[0] + 127) / 128) * 4;
@ -868,17 +876,17 @@ static vsi_nn_kernel_node_t _setup
hashkey = HASH_INSTANCENORM_KEY( in0_dtype, out_dtype, reshape_flg );
status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_MEAN_VARI );
if( VSI_SUCCESS != status )
if ( VSI_SUCCESS != status )
{
goto final;
}
status = _query_kernel( kernel, hashkey, INTERNAL_KERNEL_NORM );
if( VSI_SUCCESS != status )
if ( VSI_SUCCESS != status )
{
goto final;
}
if(reshape_flg)
if (reshape_flg)
{
int32_t shape[VSI_NN_MAX_DIM_NUM] = {0};
shape[0] = inputs[0]->attr.size[0];
@ -893,7 +901,7 @@ static vsi_nn_kernel_node_t _setup
shape[3] = outputs[0]->attr.dim_num > 3 ? outputs[0]->attr.size[3] : 1;
rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 );
}
if(inputs[1]->attr.dim_num < 2)
if (inputs[1]->attr.dim_num < 2)
{
int32_t shape[VSI_NN_MAX_DIM_NUM] = {0};
shape[0] = inputs[1]->attr.size[0];
@ -902,7 +910,7 @@ static vsi_nn_kernel_node_t _setup
shape[3] = 1;
rs_beta = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shape, 4 );
}
if(inputs[2]->attr.dim_num < 2)
if (inputs[2]->attr.dim_num < 2)
{
int32_t shape[VSI_NN_MAX_DIM_NUM] = {0};
shape[0] = inputs[2]->attr.size[0];
@ -914,10 +922,10 @@ static vsi_nn_kernel_node_t _setup
// Mean Vari
{
tmp_node = vsi_nn_kernel_create_node( graph, ikernels[MEAN_VARI_INDEX] );
if(tmp_node)
if (tmp_node)
{
uint32_t index = 0;
if(reshape_flg)
if (reshape_flg)
{
mean_vari_node_params[index++] = rs_input;
vsi_nn_kernel_node_pack_io( &mean_vari_node_params[index],
@ -943,7 +951,7 @@ static vsi_nn_kernel_node_t _setup
border.mode = VX_BORDER_CONSTANT;
border.constant_value.U8 = 0;
border.constant_value.U16 = 0;
if(inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8)
if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8)
{
border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point;
}
@ -956,10 +964,10 @@ static vsi_nn_kernel_node_t _setup
// Nomalization
{
node = vsi_nn_kernel_create_node( graph, kernel );
if(node)
if (node)
{
uint32_t index = 0;
if(reshape_flg)
if (reshape_flg)
{
node_params[index++] = rs_input;
}
@ -967,7 +975,7 @@ static vsi_nn_kernel_node_t _setup
{
node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t;
}
if(inputs[1]->attr.dim_num < 2)
if (inputs[1]->attr.dim_num < 2)
{
node_params[index++] = rs_beta;
}
@ -975,7 +983,7 @@ static vsi_nn_kernel_node_t _setup
{
node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t;
}
if(inputs[2]->attr.dim_num < 2)
if (inputs[2]->attr.dim_num < 2)
{
node_params[index++] = rs_gamma;
}
@ -984,7 +992,7 @@ static vsi_nn_kernel_node_t _setup
node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t;
}
node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t;
if(reshape_flg)
if (reshape_flg)
{
node_params[index++] = rs_output;
}
@ -1006,9 +1014,9 @@ static vsi_nn_kernel_node_t _setup
border.mode = VX_BORDER_CONSTANT;
border.constant_value.U8 = 0;
border.constant_value.U16 = 0;
if(outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8)
if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8)
{
border.constant_value.U8 = (vx_uint8)outputs[0]->attr.dtype.zero_point;
border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point;
}
status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) );
CHECK_STATUS(status);
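The hunk above switches the normalization node's border constant from the output's zero point to the input's: out-of-bounds reads happen on the input tensor, and for asymmetric U8 a padded value equal to the input zero point dequantizes to exactly zero. A hedged one-liner illustrating that, with names local to this note:

/* real = (q - zero_point) * scale, so q == zero_point contributes 0.0f regardless of scale. */
static float dequant_u8( uint8_t q, int32_t zero_point, float scale )
{
    return ( (int32_t)q - zero_point ) * scale;
}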
@ -1018,31 +1026,31 @@ static vsi_nn_kernel_node_t _setup
/* Pass parameters to node. */
final:
if(rs_beta)
if (rs_beta)
{
vsi_nn_kernel_tensor_release( &rs_beta );
}
if(rs_gamma)
if (rs_gamma)
{
vsi_nn_kernel_tensor_release( &rs_gamma );
}
if(reshape_flg)
if (reshape_flg)
{
vsi_nn_kernel_tensor_release( &rs_input );
vsi_nn_kernel_tensor_release( &rs_output );
}
for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ )
{
if( ikernels[i] )
if ( ikernels[i] )
{
vsi_nn_kernel_release( &ikernels[i] );
}
if( tensors[i] )
if ( tensors[i] )
{
vsi_nn_ReleaseTensor( &tensors[i] );
}
}
if(tmp_node) {vsi_nn_kernel_node_release( &tmp_node );}
if (tmp_node) {vsi_nn_kernel_node_release( &tmp_node );}
return node;
} /* _setup() */

File diff suppressed because it is too large

View File

@ -68,7 +68,6 @@ static const struct {
{
TENSOR_PRE_PROCESS_BGRA_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1)
TENSOR_PRE_PROCESS_BGRA_KERNELS(U8, U8, COPY, KERNEL_SOURCE_1)
TENSOR_PRE_PROCESS_BGRA_KERNELS(U8, U8, SCALE_NHWC, KERNEL_SOURCE_2)
};
static vx_param_description_t vxPreProcessBgraKernel_param_def[] =
@ -106,7 +105,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer)
int32_t dstZP = 0;
float outputScale = 1;
int32_t reorder = 0;
int32_t trans = 0;
int32_t xRatio = 0;
int32_t yRatio = 0;
int32_t order1 = 2;
@ -126,8 +124,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &reorder);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &trans);
CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[0]->shape;
dstZP = attr[0]->asymm.zero_point;
@ -135,19 +131,14 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer)
width = out_shape->data[0];
height = out_shape->data[1];
if(trans)
{
width = width / 3;
}
if(reorder != 0)
if (reorder != 0)
{
reorder = 2;
order1 = 0;
}
enable_copy = (int32_t)(xRatio == (1 << 15) && yRatio == (1 << 15));
if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[0]->dfp.fl > 0)
{
@ -159,11 +150,11 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer)
}
dstZP = 0;
}
else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
outputScale = 1.0f/outputScale;
}
else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
outputScale = 1;
dstZP = 0;
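Two details in this hunk are easy to miss. The copy fast path assumes the resize ratios are Q15 fixed point, so a value of 1 << 15 encodes a 1:1 scale, and for asymmetric outputs the scale is pre-inverted so the kernel can multiply instead of divide when requantizing. A hedged sketch of both, under those assumptions and with names local to this note:

static int is_copy_ratio( int32_t xRatio, int32_t yRatio )
{
    /* Q15: 1 << 15 == 1.0, i.e. no resampling in either direction. */
    return ( xRatio == (1 << 15) ) && ( yRatio == (1 << 15) );
}

static uint8_t requant_u8( float real, float inv_scale, int32_t zero_point )
{
    /* q = real / scale + zp, written with the pre-inverted scale; round-half-up for real >= 0. */
    return (uint8_t)( real * inv_scale + (float)zero_point + 0.5f );
}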
@ -286,16 +277,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer)
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractInt32BgraToU8Bgr_2x8 = {{
0x00333333, // TCfg
0x00111000, // ASelt
0x00020100, 0x00000201, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
// copy
gpu_dp_inst_t uniExtractBfromBgra_4x4 = {{
0x01010101, // TCfg
@ -355,23 +336,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer)
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
if(trans)
{
status = vsi_nn_kernel_gpu_add_param(node, "uniExtractInt32BgraToU8Bgr_2x8",
&uniExtractInt32BgraToU8Bgr_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp1BgraShort_4x4", &uniBilinearTmp1BgraShort_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp2BgraShort_4x4", &uniBilinearTmp2BgraShort_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp3BgraShort_4x4", &uniBilinearTmp3BgraShort_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp4BgraShort_4x4", &uniBilinearTmp4BgraShort_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp5BgraShort_4x4", &uniBilinearTmp5BgraShort_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp6BgraShort_4x4", &uniBilinearTmp6BgraShort_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp7BgraShort_4x4", &uniBilinearTmp7BgraShort_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp8BgraShort_4x4", &uniBilinearTmp8BgraShort_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniDescaleU8_4x4", &uniDescaleU8_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertIntergetoF32_4x4", &uniConvertIntergetoF32_4x4);
CHECK_STATUS_FAIL_GOTO(status, OnError);
}
else if(enable_copy)
if (enable_copy)
{
status = vsi_nn_kernel_gpu_add_param(node, "uniExtractBfromBgra_4x4", &uniExtractBfromBgra_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractGfromBgra_4x4", &uniExtractGfromBgra_4x4);
@ -429,16 +394,11 @@ static vsi_status _query_kernel
uint32_t key = 0;
int i = 0;
vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" );
vsi_bool enable_perm = vsi_nn_kernel_param_get_int32( params, "enable_perm" );
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if(enable_perm)
{
convert_type = SCALE_NHWC;
}
else if(enable_copy)
if (enable_copy)
{
convert_type = COPY;
}
@ -449,14 +409,14 @@ static vsi_status _query_kernel
key = HASH_PRE_PROCESS_BGRA_KEY( input0_dtype, output_dtype, convert_type, 0 );
for( i = 0; i < _cnt_of_array(pre_process_bgra_map); i ++ )
for ( i = 0; i < _cnt_of_array(pre_process_bgra_map); i ++ )
{
if( pre_process_bgra_map[i].key == key )
{
break;
}
}
if( i < _cnt_of_array(pre_process_bgra_map) )
if ( i < _cnt_of_array(pre_process_bgra_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_bgra_map[i].function_name );
kernel->info.parameters = vxPreProcessBgraKernel_param_def;
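_query_kernel resolves the shader variant by packing the input dtype, output dtype, and convert type into a single key, then scanning pre_process_bgra_map linearly; a loop index equal to _cnt_of_array(pre_process_bgra_map) means no match and the query fails. A hedged sketch of the key packing; the real field widths belong to HASH_PRE_PROCESS_BGRA_KEY and may differ:

/* Illustrative packing only; the actual macro defines the field layout. */
#define _EXAMPLE_BGRA_KEY( IN_DT, OUT_DT, CONV_TYPE, IMG_2D ) \
    ( ( (uint32_t)(IN_DT) << 24 ) | ( (uint32_t)(OUT_DT) << 16 ) | \
      ( (uint32_t)(CONV_TYPE) << 8 ) | (uint32_t)(IMG_2D) )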
@ -488,19 +448,19 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_t node = NULL;
int32_t shapes[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1};
vsi_nn_tensor_t* reshape_tensors[1] = {NULL};
int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" );
int32_t trans = 0;
if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( inputs, outputs, kernel, params );
if( VSI_SUCCESS == status)
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
uint32_t index = 2;
int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" );

View File

@ -43,7 +43,6 @@ __BEGIN_DECLS
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toU8")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toI8")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_COPY_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_nv12_copy_U8toU8")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_TRANS_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_nv12_trans_U8toU8")
// greater than a quarter
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOU8_GQ CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toU8_gq")
@ -51,7 +50,6 @@ __BEGIN_DECLS
#define KERNEL_SOURCE_1 "pre_process_nv12_scale_8bits",
#define KERNEL_SOURCE_2 "pre_process_nv12_scale",
#define KERNEL_SOURCE_3 "pre_process_nv12_trans_u8",
#define KERNEL_SOURCE_4 "pre_process_nv12_scale_mix"
typedef enum
@ -85,7 +83,6 @@ static const struct {
TENSOR_PRE_PROCESS_NV12_KERNELS(U8, U8, COPY, KERNEL_SOURCE_1)
TENSOR_PRE_PROCESS_NV12_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_2)
TENSOR_PRE_PROCESS_NV12_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_2)
TENSOR_PRE_PROCESS_NV12_KERNELS(U8, U8, TRANS, KERNEL_SOURCE_3)
TENSOR_PRE_PROCESS_NV12_GQ_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_4)
TENSOR_PRE_PROCESS_NV12_GQ_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_4)
};
@ -156,17 +153,17 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer)
width = out_shape->data[0];
height = out_shape->data[1];
if(reorder != 0)
if (reorder != 0)
{
reorder = 2;
order1 = 0;
}
if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
dstScale = 1.0f / dstScale;
}
else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[0]->dfp.fl > 0)
{
@ -178,7 +175,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer)
}
dstZP = 0;
}
else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE)
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE)
{
dstScale = 1;
dstZP = 0;
@ -295,7 +292,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer)
int32_t dstZP = 0;
float dstScale = 1;
int32_t reorder = 0;
int32_t trans = 0;
int32_t order1 = 2;
uint32_t width = 0;
uint32_t height = 0;
@ -325,8 +321,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &reorder);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &trans);
CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[1]->shape;
dstZP = attr[1]->asymm.zero_point;
@ -334,24 +328,21 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer)
width = out_shape->data[0];
height = out_shape->data[1];
if(reorder != 0)
if (reorder != 0)
{
reorder = 2;
order1 = 0;
}
if(trans)
{
width = width / 3;
}
resize = (float)width / attr[0]->shape->data[0];
xrIntFloat_16 = (attr[0]->shape->data[0] << 16) / width + 1;
yrIntFloat_16 = (attr[0]->shape->data[1] << 16) / height + 1;
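    /* xrIntFloat_16 / yrIntFloat_16 are the source step per destination pixel in 16.16 fixed
     * point: step = (src_dim << 16) / dst_dim + 1. The kernel would then walk the source as
     * src_x = ((uint32_t)dst_x * xrIntFloat_16) >> 16 (assumed usage, not shown in this hunk).
     * e.g. src 1920 -> dst 640: (1920 << 16) / 640 + 1 = 0x30001, roughly 3 source pixels per step. */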
if(attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
dstScale = 1.0f / dstScale;
}
else if(attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP)
else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[1]->dfp.fl > 0)
{
@ -363,7 +354,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer)
}
dstZP = 0;
}
else if(attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE)
else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE)
{
dstScale = 1;
dstZP = 0;
@ -450,27 +441,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer)
}, GPU_DP_TYPE_16 };
//trans
gpu_dp_inst_t uniTransPackBgr1st_2x8 = {{
0x11311311, // TCfg
0x00100100, // ASelt
0x01000400, 0x06020105, // ABin
0x22022022, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000000, 0x00000001,
0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniTransPackBgr2nd_2x8 = {{
0x00003113, // TCfg
0x00001001, // ASelt
0x03070302, 0x00000000, // ABin
0x00000220, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000001, 0x00000001, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniCalculateYShift_2x8 = {{
0x00009999, // TCfg
0x00000000, // ASelt
@ -502,23 +472,15 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp);
if(resize >= 0.25 && (attr[1]->dtype == U8 || attr[1]->dtype == F16) && !trans)
if (resize >= 0.25 && (attr[1]->dtype == U8 || attr[1]->dtype == F16))
{
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateYShift_2x8", &uniCalculateYShift_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateUVShift_2x8", &uniCalculateUVShift_2x8);
}
CHECK_STATUS_FAIL_GOTO(status, OnError );
if(trans && attr[1]->dtype == U8)
{
status = vsi_nn_kernel_gpu_add_param(node, "uniTransPackBgr1st_2x8", &uniTransPackBgr1st_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniTransPackBgr2nd_2x8", &uniTransPackBgr2nd_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
else
{
status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder);
}
status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder);
status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1);
CHECK_STATUS_FAIL_GOTO(status, OnError );
@ -572,20 +534,15 @@ static vsi_status _query_kernel
uint32_t key = 0;
int i = 0;
vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" );
vsi_bool enable_perm = vsi_nn_kernel_param_get_int32( params, "enable_perm" );
uint32_t srcWidth = inputs[0]->attr.size[0];
uint32_t dstWidth = enable_perm ? outputs[0]->attr.size[1] : outputs[0]->attr.size[0];
uint32_t dstWidth = outputs[0]->attr.size[0];
float scaleVal = (float)dstWidth / srcWidth;
uint32_t optFlg = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if(enable_perm)
{
convert_type = TRANS;
}
else if(enable_copy && output_dtype == U8)
if (enable_copy && output_dtype == U8)
{
convert_type = COPY;
}
@ -594,7 +551,7 @@ static vsi_status _query_kernel
convert_type = SCALE;
}
if(scaleVal >= 0.25 && (output_dtype == U8 || output_dtype == F16) && convert_type == SCALE)
if (scaleVal >= 0.25 && (output_dtype == U8 || output_dtype == F16) && convert_type == SCALE)
{
optFlg = 1;
}
@ -608,7 +565,7 @@ static vsi_status _query_kernel
break;
}
}
if( i < _cnt_of_array(pre_process_nv12_map) )
if ( i < _cnt_of_array(pre_process_nv12_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_nv12_map[i].function_name );
kernel->info.parameters = vxPreProcessNv12Kernel_param_def;
@ -646,21 +603,20 @@ static vsi_nn_kernel_node_t _setup
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_NV12_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
int32_t shapes[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1};
vsi_nn_tensor_t* reshape_tensors[1] = {NULL};
int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" );
int32_t trans = 0;
if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( inputs, outputs, kernel, params );
if( VSI_SUCCESS == status)
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
uint32_t index = 3;
int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" );
@ -674,22 +630,9 @@ static vsi_nn_kernel_node_t _setup
int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" );
/* Pass parameters to node. */
if(trans)
{
shapes[0] = outputs[0]->attr.size[0] * outputs[0]->attr.size[1];
shapes[1] = outputs[0]->attr.size[2];
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
outputs[0], (uint32_t*)shapes, outputs[0]->attr.dim_num);
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_NV12_PARAM_NUM,
inputs, 2, &reshape_tensors[0], 1 );
}
else
{
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_NV12_PARAM_NUM,
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_NV12_PARAM_NUM,
inputs, 2, outputs, 1 );
}
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left );

View File

@ -90,14 +90,6 @@ static const struct {
TENSOR_PRE_PROCESS_RGB_KERNELS(U8, I16, COPY, KERNEL_SOURCE_2)
TENSOR_PRE_PROCESS_RGB_KERNELS(U8, U8, COPY, KERNEL_SOURCE_2)
TENSOR_PRE_PROCESS_RGB_KERNELS(U8, I8, COPY, KERNEL_SOURCE_2)
TENSOR_PRE_PROCESS_RGB_KERNELS(U8, F16, SCALE_NHWC, KERNEL_SOURCE_3)
TENSOR_PRE_PROCESS_RGB_KERNELS(U8, I16, SCALE_NHWC, KERNEL_SOURCE_3)
TENSOR_PRE_PROCESS_RGB_KERNELS(U8, U8, SCALE_NHWC, KERNEL_SOURCE_3)
TENSOR_PRE_PROCESS_RGB_KERNELS(U8, I8, SCALE_NHWC, KERNEL_SOURCE_3)
TENSOR_PRE_PROCESS_RGB_KERNELS(U8, F16, COPY_NHWC, KERNEL_SOURCE_4)
TENSOR_PRE_PROCESS_RGB_KERNELS(U8, I16, COPY_NHWC, KERNEL_SOURCE_4)
TENSOR_PRE_PROCESS_RGB_KERNELS(U8, U8, COPY_NHWC, KERNEL_SOURCE_4)
TENSOR_PRE_PROCESS_RGB_KERNELS(U8, I8, COPY_NHWC, KERNEL_SOURCE_4)
};
static vx_param_description_t vxPreProcessRgbKernel_param_def[] =
@ -156,8 +148,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &reorder);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &trans);
CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[0]->shape;
outputZP = (float)attr[0]->asymm.zero_point;
@ -165,14 +155,14 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
width = out_shape->data[0];
height = out_shape->data[1];
if(reorder != 0)
if (reorder != 0)
{
reorder = 2;
order1 = 0;
}
enable_copy = (int32_t)(xRatio == (1 << 15) && yRatio == (1 << 15));
if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[0]->dfp.fl > 0)
{
@ -184,11 +174,11 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
}
outputZP = 0;
}
else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
outputScale = 1.0f / outputScale;
}
else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
outputScale = 1;
outputZP = 0;
@ -199,48 +189,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
pack_key = _PACK_SELECT_KEY( enable_copy, reorder, trans);
{
// trans and copy
gpu_dp_inst_t uniNormilizationLo_2x8 = {{
0x99999999, // TCfg
0x44444444, // ASelt
0x45002142, 0x27480324, // ABin
0x99999999, // BSelt
0x06060606, 0x06060606, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000,
0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniNormilizationHi_2x8 = {{
0x09999999, // TCfg
0x04444444, // ASelt
0x092a4b06, 0x000c2d4e, // ABin
0x09999999, // BSelt
0x06060606, 0x00060606, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000,
0x3c000000, 0x3c000000, 0x3c000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniNormilizationLo_NHWC_2x8 = {{
0x99999999, // TCfg
0x44444444, // ASelt
0x03422100, 0x27064524, // ABin
0x99999999, // BSelt
0x06060606, 0x06060606, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000,
0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniNormilizationHi_NHWC_2x8 = {{
0x09999999, // TCfg
0x04444444, // ASelt
0x4b2a0948, 0x004e2d0c, // ABin
0x09999999, // BSelt
0x06060606, 0x00060606, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000,
0x3c000000, 0x3c000000, 0x3c000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
// copy
gpu_dp_inst_t uniExtractRtoF32_part0_4x4 = {{
0x01010101, // TCfg
@ -404,79 +352,9 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniRePackRGBLo_2x8 = {{
0x00111111, // TCfg
0x00001001, // ASelt
0x01000400, 0x00000105, // ABin
0x00222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniRePackRGBHi_2x8 = {{
0x00111111, // TCfg
0x00001001, // ASelt
0x03020602, 0x00000307, // ABin
0x00222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniRePackRGBLo_NHWC_2x8 = {{
0x00111111, // TCfg
0x00100100, // ASelt
0x01000400, 0x00000105, // ABin
0x00222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniRePackRGBHi_NHWC_2x8 = {{
0x00111111, // TCfg
0x00100100, // ASelt
0x03020602, 0x00000307, // ABin
0x00222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
switch( pack_key )
switch ( pack_key )
{
case _PACK_SELECT_KEY( 1, 0, 1): // copy trans
{
shaderParam.global_scale[0] = 15;
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1)
/ shaderParam.global_scale[0], 4);
shaderParam.global_size[1] = height;
shaderParam.global_size[2] = 1;
status = vsi_nn_kernel_gpu_add_param(node, "uniNormilizationLo_2x8", &uniNormilizationLo_NHWC_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniNormilizationHi_2x8", &uniNormilizationHi_NHWC_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError);
}
break;
case _PACK_SELECT_KEY( 1, 2, 1): // copy reorder trans
{
shaderParam.global_scale[0] = 15;
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1)
/ shaderParam.global_scale[0], 4);
shaderParam.global_size[1] = height;
shaderParam.global_size[2] = 1;
status = vsi_nn_kernel_gpu_add_param(node, "uniNormilizationLo_2x8", &uniNormilizationLo_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniNormilizationHi_2x8", &uniNormilizationHi_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError);
}
break;
case _PACK_SELECT_KEY( 1, 0, 0): // copy
case _PACK_SELECT_KEY( 1, 2, 0): // copy reorder
{
@ -539,68 +417,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError);
}
break;
case _PACK_SELECT_KEY( 0, 0, 1): // trans
{
shaderParam.global_scale[0] = 4;
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
shaderParam.global_size[0] = gpu_align_p2((width / 3 + shaderParam.global_scale[0] - 1)
/ shaderParam.global_scale[0], 4);
shaderParam.global_size[1] = height;
shaderParam.global_size[2] = 1;
if(attr[0]->dtype == F16)
{
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8);
}
else
{
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractInteger_2x8);
}
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertIntergetoF32_4x4", &uniConvertIntergetoF32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniUnpackToR", &uniUnpackToR);
status |= vsi_nn_kernel_gpu_add_param(node, "uniUnpackToG", &uniUnpackToG);
status |= vsi_nn_kernel_gpu_add_param(node, "uniUnpackToB", &uniUnpackToB);
status |= vsi_nn_kernel_gpu_add_param(node, "uniVecShift10", &uniVecShift10);
status |= vsi_nn_kernel_gpu_add_param(node, "uniAddRShift", &uniAddRShift);
status |= vsi_nn_kernel_gpu_add_param(node, "uniGetTempVal", &uniGetTempVal);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBytes", &uniExtractBytes);
status |= vsi_nn_kernel_gpu_add_param(node, "uniRePackRGBLo_2x8", &uniRePackRGBLo_NHWC_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniRePackRGBHi_2x8", &uniRePackRGBHi_NHWC_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError);
}
break;
case _PACK_SELECT_KEY( 0, 2, 1): // reorder trans
{
shaderParam.global_scale[0] = 4;
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
shaderParam.global_size[0] = gpu_align_p2((width / 3 + shaderParam.global_scale[0] - 1)
/ shaderParam.global_scale[0], 4);
shaderParam.global_size[1] = height;
shaderParam.global_size[2] = 1;
if(attr[0]->dtype == F16)
{
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8);
}
else
{
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractInteger_2x8);
}
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertIntergetoF32_4x4", &uniConvertIntergetoF32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniUnpackToR", &uniUnpackToR);
status |= vsi_nn_kernel_gpu_add_param(node, "uniUnpackToG", &uniUnpackToG);
status |= vsi_nn_kernel_gpu_add_param(node, "uniUnpackToB", &uniUnpackToB);
status |= vsi_nn_kernel_gpu_add_param(node, "uniVecShift10", &uniVecShift10);
status |= vsi_nn_kernel_gpu_add_param(node, "uniAddRShift", &uniAddRShift);
status |= vsi_nn_kernel_gpu_add_param(node, "uniGetTempVal", &uniGetTempVal);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBytes", &uniExtractBytes);
status |= vsi_nn_kernel_gpu_add_param(node, "uniRePackRGBLo_2x8", &uniRePackRGBLo_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniRePackRGBHi_2x8", &uniRePackRGBHi_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError);
}
break;
default:
break;
}
@ -637,23 +453,14 @@ static vsi_status _query_kernel
uint32_t key = 0;
int i = 0;
vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" );
vsi_bool enable_perm = vsi_nn_kernel_param_get_int32( params, "enable_perm" );
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if(enable_copy && enable_perm)
{
convert_type = COPY_NHWC;
}
else if(enable_copy)
if (enable_copy)
{
convert_type = COPY;
}
else if(enable_perm)
{
convert_type = SCALE_NHWC;
}
else
{
convert_type = SCALE;
@ -661,14 +468,14 @@ static vsi_status _query_kernel
key = HASH_PRE_PROCESS_RGB_KEY( input0_dtype, output_dtype, convert_type, 0 );
for( i = 0; i < _cnt_of_array(pre_process_rgb_map); i ++ )
for ( i = 0; i < _cnt_of_array(pre_process_rgb_map); i ++ )
{
if( pre_process_rgb_map[i].key == key )
{
break;
}
}
if( i < _cnt_of_array(pre_process_rgb_map) )
if ( i < _cnt_of_array(pre_process_rgb_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_rgb_map[i].function_name );
kernel->info.parameters = vxPreProcessRgbKernel_param_def;
@ -698,21 +505,20 @@ static vsi_nn_kernel_node_t _setup
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_RGB_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
int32_t shapes[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1};
vsi_nn_tensor_t* reshape_tensors[1] = {NULL};
int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" );
int32_t trans = 0;
if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( inputs, outputs, kernel, params );
if( VSI_SUCCESS == status)
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
uint32_t index = 2;
int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" );
@ -726,18 +532,7 @@ static vsi_nn_kernel_node_t _setup
int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" );
/* Pass parameters to node. */
if(trans)
{
shapes[0] = outputs[0]->attr.size[0] * outputs[0]->attr.size[1];
shapes[1] = outputs[0]->attr.size[2];
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
outputs[0], (uint32_t*)shapes, outputs[0]->attr.dim_num);
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_RGB_PARAM_NUM,
inputs, 1, &reshape_tensors[0], 1 );
}
else
if (trans == 0)
{
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_RGB_PARAM_NUM,
inputs, 1, outputs, 1 );
@ -767,7 +562,7 @@ static vsi_nn_kernel_node_t _setup
}
}
if(reshape_tensors[0])
if (reshape_tensors[0])
{
vsi_nn_ReleaseTensor(&reshape_tensors[0]);
}

View File

@ -43,15 +43,12 @@ __BEGIN_DECLS
#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_scale_U8toU8")
#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_SCALE_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_scale_U8toI8")
#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_COPY_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_copy_U8toU8")
#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_COPY_TRANS_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_copy_trans_U8")
#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_TRANS_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_trans_U8toU8")
#define KERNEL_SOURCE_1 "pre_process_yuv420_scale_u8",
#define KERNEL_SOURCE_2 "pre_process_yuv420_copy_u8",
#define KERNEL_SOURCE_3 "pre_process_yuv420_scale_fp16",
#define KERNEL_SOURCE_4 "pre_process_yuv420_scale_i16",
#define KERNEL_SOURCE_5 "pre_process_yuv420_scale_i8",
#define KERNEL_SOURCE_6 "pre_process_yuv420_trans_u8"
typedef enum
{
@ -80,8 +77,6 @@ static const struct {
TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1)
TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_5)
TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, COPY, KERNEL_SOURCE_2)
TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, COPY_TRANS, KERNEL_SOURCE_2)
TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, TRANS, KERNEL_SOURCE_6)
};
static vx_param_description_t vxPreProcessYuv420Kernel_param_def[] =
@ -143,24 +138,24 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer)
width = out_shape->data[0];
height = out_shape->data[1];
if(reorder != 0)
if (reorder != 0)
{
reorder = 2;
order1 = 0;
}
if(trans)
if (trans)
{
width = width / 3;
}
if(attr[0]->dtype == U8)
if (attr[0]->dtype == U8)
{
dstScale = 1.0f / dstScale;
}
shaderParam.global_scale[0] = 16;
if(attr[0]->dtype == I16 || attr[0]->dtype == F16)
if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
{
shaderParam.global_scale[0] = 8;
}
@ -176,131 +171,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError);
{
gpu_dp_inst_t uniPackBG0_2x8 = {{
0x11011011, // TCfg
0x10010010, // ASelt
0x01000000, 0x02020001, // ABin
0x22022022, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000000, 0x00000001,
0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackTmpAndR_2x8 = {{
0x11111111, // TCfg
0x00100100, // ASelt
0x03000100, 0x07060104, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackRB0_2x8 = {{
0x11011011, // TCfg
0x10010010, // ASelt
0x03000302, 0x05040004, // ABin
0x22022022, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000000, 0x00000001,
0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackTmp0AndG_2x8 = {{
0x11111111, // TCfg
0x00100100, // ASelt
0x03030100, 0x07060404, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackGR1_2x8 = {{
0x11011011, // TCfg
0x10010010, // ASelt
0x06000505, 0x07070006, // ABin
0x22022022, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000000, 0x00000001,
0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackTmp1AndB_2x8 = {{
0x11111111, // TCfg
0x00100100, // ASelt
0x03060100, 0x07060704, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackBG1_2x8 = {{
0x11011011, // TCfg
0x10010010, // ASelt
0x09000808, 0x0a0a0009, // ABin
0x22022022, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000000, 0x00000001,
0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackTmp1AndR_2x8 = {{
0x11111111, // TCfg
0x00100100, // ASelt
0x03080100, 0x07060904, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackRB2_2x8 = {{
0x11011011, // TCfg
0x10010010, // ASelt
0x0b000b0a, 0x0d0c000c, // ABin
0x22022022, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000000, 0x00000001,
0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackTmp2AndG_2x8 = {{
0x11111111, // TCfg
0x00100100, // ASelt
0x030b0100, 0x07060c04, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackGR2_2x8 = {{
0x11011011, // TCfg
0x10010010, // ASelt
0x0e000d0d, 0x0f0f000e, // ABin
0x22022022, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000000, 0x00000001,
0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackTmp2AndB_2x8 = {{
0x11111111, // TCfg
0x00100100, // ASelt
0x030e0100, 0x07060f04, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniCalculateTmpR1st_4x4 = {{
0x05050505, // TCfg
0x04040404, // ASelt
@ -574,19 +444,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpB4th_4x4", &uniCalculateTmpB4th_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateB1st_4x4", &uniCalculateR1st_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackBG0_2x8", &uniPackBG0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmpAndR_2x8", &uniPackTmpAndR_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackRB0_2x8", &uniPackRB0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp0AndG_2x8", &uniPackTmp0AndG_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackGR1_2x8", &uniPackGR1_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp1AndB_2x8", &uniPackTmp1AndB_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackBG1_2x8", &uniPackBG1_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp1AndR_2x8", &uniPackTmp1AndR_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackRB2_2x8", &uniPackRB2_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp2AndG_2x8", &uniPackTmp2AndG_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackGR2_2x8", &uniPackGR2_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp2AndB_2x8", &uniPackTmp2AndB_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8LoB_2x8", &uniQuantU8toU8LoB_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8HiB_2x8", &uniQuantU8toU8HiB_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8LoG_2x8", &uniQuantU8toU8LoG_2x8);
@ -633,7 +490,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer)
int32_t dstZP = 0;
float dstScale = 1;
int32_t reorder = 0;
int32_t trans = 0;
int32_t order1 = 2;
uint32_t width = 0;
uint32_t height = 0;
@ -646,8 +502,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer)
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &reorder);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &trans);
CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[0]->shape;
dstZP = attr[0]->asymm.zero_point;
@ -655,17 +509,13 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer)
width = out_shape->data[0];
height = out_shape->data[1];
if(reorder != 0)
if (reorder != 0)
{
reorder = 2;
order1 = 0;
}
if(trans)
{
width = width / 3;
}
if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[0]->dfp.fl > 0)
{
@ -677,11 +527,11 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer)
}
dstZP = 0;
}
else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
dstScale = 1.0f/dstScale;
dstScale = 1.0f / dstScale;
}
else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
dstScale = 1;
dstZP = 0;
@ -925,26 +775,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer)
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
//trans
gpu_dp_inst_t uniTransPackBgr1st_2x8 = {{
0x11311311, // TCfg
0x00100100, // ASelt
0x01000400, 0x06020105, // ABin
0x22022022, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000000, 0x00000001, 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniTransPackBgr2nd_2x8 = {{
0x00003113, // TCfg
0x00001001, // ASelt
0x03070302, 0x00000000, // ABin
0x00000220, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000001, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
status = vsi_nn_kernel_gpu_add_param(node, "uniCalculateR1st_4x4", &uniCalculateR1st_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpGbyU_2x8", &uniCalculateTmpGbyU_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpGbyU2nd_2x8", &uniCalculateTmpGbyU2nd_2x8);
@ -975,16 +805,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateGWise_4x4", &uniCalculateGWise_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateGWise2nd_4x4", &uniCalculateGWise2nd_4x4);
if(trans)
{
status = vsi_nn_kernel_gpu_add_param(node, "uniTransPackBgr1st_2x8", &uniTransPackBgr1st_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniTransPackBgr2nd_2x8", &uniTransPackBgr2nd_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
else
{
status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder);
}
status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder);
status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1);
CHECK_STATUS_FAIL_GOTO(status, OnError );
@ -1041,20 +862,11 @@ static vsi_status _query_kernel
uint32_t key = 0;
int i = 0;
vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" );
vsi_bool enable_perm = vsi_nn_kernel_param_get_int32( params, "enable_perm" );
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if(enable_perm && enable_copy)
{
convert_type = COPY_TRANS;
}
else if(enable_perm)
{
convert_type = TRANS;
}
else if(enable_copy && output_dtype == U8)
if (enable_copy && output_dtype == U8)
{
convert_type = COPY;
}
@ -1065,20 +877,20 @@ static vsi_status _query_kernel
key = HASH_PRE_PROCESS_YUV420_KEY( input0_dtype, output_dtype, convert_type, 0 );
for( i = 0; i < _cnt_of_array(pre_process_yuv420_map); i ++ )
for ( i = 0; i < _cnt_of_array(pre_process_yuv420_map); i ++ )
{
if( pre_process_yuv420_map[i].key == key )
if ( pre_process_yuv420_map[i].key == key )
{
break;
}
}
if( i < _cnt_of_array(pre_process_yuv420_map) )
if ( i < _cnt_of_array(pre_process_yuv420_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_yuv420_map[i].function_name );
kernel->info.parameters = vxPreProcessYuv420Kernel_param_def;
kernel->info.numParams = _cnt_of_array( vxPreProcessYuv420Kernel_param_def );
if(enable_copy && output_dtype == U8)
if (enable_copy && output_dtype == U8)
{
kernel->info.initialize = _pre_process_yuv420_copy_initializer;
}
@ -1110,21 +922,20 @@ static vsi_nn_kernel_node_t _setup
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_YUV420_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
int32_t shapes[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1};
vsi_nn_tensor_t* reshape_tensors[1] = {NULL};
int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" );
int32_t trans = 0;
if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( inputs, outputs, kernel, params );
if( VSI_SUCCESS == status)
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
uint32_t index = 4;
int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" );
@ -1138,22 +949,10 @@ static vsi_nn_kernel_node_t _setup
int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" );
/* Pass parameters to node. */
if(trans)
{
shapes[0] = outputs[0]->attr.size[0] * outputs[0]->attr.size[1];
shapes[1] = outputs[0]->attr.size[2];
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
outputs[0], (uint32_t*)shapes, outputs[0]->attr.dim_num);
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_YUV420_PARAM_NUM,
inputs, 3, outputs, 1 );
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_YUV420_PARAM_NUM,
inputs, 3, &reshape_tensors[0], 1 );
}
else
{
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_YUV420_PARAM_NUM,
inputs, 3, outputs, 1 );
}
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left );
@ -1178,7 +977,7 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &tmp_params[13] );
}
}
if(reshape_tensors[0])
if (reshape_tensors[0])
{
vsi_nn_ReleaseTensor(&reshape_tensors[0]);
}

View File

@ -43,11 +43,8 @@ __BEGIN_DECLS
#define VX_KERNEL_NAME_PRE_PROCESS_YUV444_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_scale_U8toU8")
#define VX_KERNEL_NAME_PRE_PROCESS_YUV444_SCALE_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_scale_U8toI8")
#define VX_KERNEL_NAME_PRE_PROCESS_YUV444_COPY_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_copy_U8toU8")
#define VX_KERNEL_NAME_PRE_PROCESS_YUV444_COPY_TRANS_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_copy_trans_U8")
#define VX_KERNEL_NAME_PRE_PROCESS_YUV444_TRANS_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_trans_U8toU8")
#define KERNEL_SOURCE_1 "pre_process_yuv444_scale",
#define KERNEL_SOURCE_2 "pre_process_yuv444_trans_u8",
#define KERNEL_SOURCE_3 "pre_process_yuv444_scale_fp16",
#define KERNEL_SOURCE_4 "pre_process_yuv444_copy_u8",
@ -78,8 +75,6 @@ static const struct {
TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1)
TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_1)
TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, U8, COPY, KERNEL_SOURCE_4)
TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, U8, COPY_TRANS, KERNEL_SOURCE_4)
TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, U8, TRANS, KERNEL_SOURCE_2)
};
static vx_param_description_t vxPreProcessYuv444Kernel_param_def[] =
@ -119,7 +114,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer)
int32_t dstZP = 0;
float dstScale = 1;
int32_t reorder = 0;
int32_t trans = 0;
int32_t order1 = 2;
uint32_t width = 0;
uint32_t height = 0;
@ -132,8 +126,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer)
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &reorder);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &trans);
CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[0]->shape;
dstZP = attr[0]->asymm.zero_point;
@ -141,24 +133,19 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer)
width = out_shape->data[0];
height = out_shape->data[1];
if(reorder != 0)
if (reorder != 0)
{
reorder = 2;
order1 = 0;
}
if(trans)
{
width = width / 3;
}
if(attr[0]->dtype == U8)
if (attr[0]->dtype == U8)
{
dstScale = 1.0f / dstScale;
}
shaderParam.global_scale[0] = 16;
if(attr[0]->dtype == I16 || attr[0]->dtype == F16)
if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
{
shaderParam.global_scale[0] = 8;
}
@ -174,131 +161,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError);
{
gpu_dp_inst_t uniPackBG0_2x8 = {{
0x11011011, // TCfg
0x10010010, // ASelt
0x01000000, 0x02020001, // ABin
0x22022022, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000000, 0x00000001,
0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackTmpAndR_2x8 = {{
0x11111111, // TCfg
0x00100100, // ASelt
0x03000100, 0x07060104, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackRB0_2x8 = {{
0x11011011, // TCfg
0x10010010, // ASelt
0x03000302, 0x05040004, // ABin
0x22022022, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000000, 0x00000001,
0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackTmp0AndG_2x8 = {{
0x11111111, // TCfg
0x00100100, // ASelt
0x03030100, 0x07060404, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackGR1_2x8 = {{
0x11011011, // TCfg
0x10010010, // ASelt
0x06000505, 0x07070006, // ABin
0x22022022, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000000, 0x00000001,
0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackTmp1AndB_2x8 = {{
0x11111111, // TCfg
0x00100100, // ASelt
0x03060100, 0x07060704, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackBG1_2x8 = {{
0x11011011, // TCfg
0x10010010, // ASelt
0x09000808, 0x0a0a0009, // ABin
0x22022022, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000000, 0x00000001,
0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackTmp1AndR_2x8 = {{
0x11111111, // TCfg
0x00100100, // ASelt
0x03080100, 0x07060904, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackRB2_2x8 = {{
0x11011011, // TCfg
0x10010010, // ASelt
0x0b000b0a, 0x0d0c000c, // ABin
0x22022022, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000000, 0x00000001,
0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackTmp2AndG_2x8 = {{
0x11111111, // TCfg
0x00100100, // ASelt
0x030b0100, 0x07060c04, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackGR2_2x8 = {{
0x11011011, // TCfg
0x10010010, // ASelt
0x0e000d0d, 0x0f0f000e, // ABin
0x22022022, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000000, 0x00000001,
0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackTmp2AndB_2x8 = {{
0x11111111, // TCfg
0x00100100, // ASelt
0x030e0100, 0x07060f04, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniCalculateTmpR1st_4x4 = {{
0x05050505, // TCfg
0x04040404, // ASelt
@ -563,19 +425,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpB4th_4x4", &uniCalculateTmpB4th_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateB1st_4x4", &uniCalculateR1st_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackBG0_2x8", &uniPackBG0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmpAndR_2x8", &uniPackTmpAndR_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackRB0_2x8", &uniPackRB0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp0AndG_2x8", &uniPackTmp0AndG_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackGR1_2x8", &uniPackGR1_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp1AndB_2x8", &uniPackTmp1AndB_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackBG1_2x8", &uniPackBG1_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp1AndR_2x8", &uniPackTmp1AndR_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackRB2_2x8", &uniPackRB2_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp2AndG_2x8", &uniPackTmp2AndG_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackGR2_2x8", &uniPackGR2_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp2AndB_2x8", &uniPackTmp2AndB_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8LoB_2x8", &uniQuantU8toU8LoB_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8HiB_2x8", &uniQuantU8toU8HiB_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8LoG_2x8", &uniQuantU8toU8LoG_2x8);
@ -622,7 +471,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer)
int32_t dstZP = 0;
float dstScale = 1;
int32_t reorder = 0;
int32_t trans = 0;
int32_t order1 = 2;
uint32_t width = 0;
uint32_t height = 0;
@ -635,8 +483,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer)
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &reorder);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &trans);
CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[0]->shape;
dstZP = attr[0]->asymm.zero_point;
@ -644,17 +490,13 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer)
width = out_shape->data[0];
height = out_shape->data[1];
if(reorder != 0)
if (reorder != 0)
{
reorder = 2;
order1 = 0;
}
if(trans)
{
width = width / 3;
}
if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[0]->dfp.fl > 0)
{
@ -666,11 +508,11 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer)
}
dstZP = 0;
}
else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
dstScale = 1.0f/dstScale;
dstScale = 1.0f / dstScale;
}
else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
dstScale = 1;
dstZP = 0;
@ -914,26 +756,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer)
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
//trans
gpu_dp_inst_t uniTransPackBgr1st_2x8 = {{
0x11311311, // TCfg
0x00100100, // ASelt
0x01000400, 0x06020105, // ABin
0x22022022, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000000, 0x00000001, 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniTransPackBgr2nd_2x8 = {{
0x00003113, // TCfg
0x00001001, // ASelt
0x03070302, 0x00000000, // ABin
0x00000220, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000001, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
status = vsi_nn_kernel_gpu_add_param(node, "uniCalculateR1st_4x4", &uniCalculateR1st_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpGbyU_2x8", &uniCalculateTmpGbyU_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpGbyU2nd_2x8", &uniCalculateTmpGbyU2nd_2x8);
@ -963,17 +785,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateGWise_4x4", &uniCalculateGWise_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateGWise2nd_4x4", &uniCalculateGWise2nd_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder);
if(trans)
{
status = vsi_nn_kernel_gpu_add_param(node, "uniTransPackBgr1st_2x8", &uniTransPackBgr1st_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniTransPackBgr2nd_2x8", &uniTransPackBgr2nd_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
else
{
status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder);
}
status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1);
CHECK_STATUS_FAIL_GOTO(status, OnError );
@ -1024,20 +837,11 @@ static vsi_status _query_kernel
uint32_t key = 0;
int i = 0;
vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" );
vsi_bool enable_perm = vsi_nn_kernel_param_get_int32( params, "enable_perm" );
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if(enable_perm && enable_copy)
{
convert_type = COPY_TRANS;
}
else if(enable_perm)
{
convert_type = TRANS;
}
else if(enable_copy && output_dtype == U8)
if (enable_copy && output_dtype == U8)
{
convert_type = COPY;
}
@ -1048,20 +852,20 @@ static vsi_status _query_kernel
key = HASH_PRE_PROCESS_YUV444_KEY( input0_dtype, output_dtype, convert_type, 0 );
for( i = 0; i < _cnt_of_array(pre_process_yuv444_map); i ++ )
for ( i = 0; i < _cnt_of_array(pre_process_yuv444_map); i ++ )
{
if( pre_process_yuv444_map[i].key == key )
if ( pre_process_yuv444_map[i].key == key )
{
break;
}
}
if( i < _cnt_of_array(pre_process_yuv444_map) )
if ( i < _cnt_of_array(pre_process_yuv444_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_yuv444_map[i].function_name );
kernel->info.parameters = vxPreProcessYuv444Kernel_param_def;
kernel->info.numParams = _cnt_of_array( vxPreProcessYuv444Kernel_param_def );
if(enable_copy && output_dtype == U8)
if (enable_copy && output_dtype == U8)
{
kernel->info.initialize = _pre_process_yuv444_copy_initializer;
}
@ -1093,21 +897,20 @@ static vsi_nn_kernel_node_t _setup
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_YUV444_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
int32_t shapes[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1};
vsi_nn_tensor_t* reshape_tensors[1] = {NULL};
int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" );
int32_t trans = 0;
if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( inputs, outputs, kernel, params );
if( VSI_SUCCESS == status)
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
uint32_t index = 4;
int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" );
@ -1121,22 +924,9 @@ static vsi_nn_kernel_node_t _setup
int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" );
/* Pass parameters to node. */
if(trans)
{
shapes[0] = outputs[0]->attr.size[0] * outputs[0]->attr.size[1];
shapes[1] = outputs[0]->attr.size[2];
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_YUV444_PARAM_NUM,
inputs, 3, outputs, 1 );
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
outputs[0], (uint32_t*)shapes, outputs[0]->attr.dim_num);
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_YUV444_PARAM_NUM,
inputs, 3, &reshape_tensors[0], 1 );
}
else
{
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_YUV444_PARAM_NUM,
inputs, 3, outputs, 1 );
}
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left );

View File

@ -369,6 +369,26 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniRightSubLeft_4x4 = {{
0x09090909, // TCfg
0x00000000, // ASelt
0x00230001, 0x00670045, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniDFPtoFp32_left_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00020000, 0x00060004, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
if (I8 == input_dtype && I8 == output_dtype && out_width > in_width)
{
@ -405,7 +425,8 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
status = vsi_nn_kernel_gpu_add_param( node, "uniConvertI32toI16_2x8", &uniConvertI32toI16_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniGetMaskShift_2x8", &uniGetMaskShift_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertDFP2FP32_part1_4x4",
status |= vsi_nn_kernel_gpu_add_param( node, "uniDFPtoFp32_left_4x4", &uniConvertDFP2FP32_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4",
&uniConvertDFP2FP32_part1_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth);
CHECK_STATUS_FAIL_GOTO(status, final );
@ -447,16 +468,22 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
status = vsi_nn_kernel_gpu_add_param( node, "uniConvertI32toI16_2x8", &uniConvertI32toI16_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniGetMaskShift_2x8", &uniGetMaskShift_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertDFP2FP32_part1_4x4",
status |= vsi_nn_kernel_gpu_add_param( node, "uniDFPtoFp32_left_4x4", &uniConvertDFP2FP32_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4",
&uniConvertDFP2FP32_part1_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth);
CHECK_STATUS_FAIL_GOTO(status, final );
gpu_param.global_scale[2] = depth;
}
else
{
status = vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniRightSubLeft_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniDFPtoFp32_left_4x4", &uniDFPtoFp32_left_4x4);
CHECK_STATUS_FAIL_GOTO(status, final );
}
status = vsi_nn_kernel_gpu_add_param( node, "uniConvertDFP2FP32_4x4", &uniConvertDFP2FP32_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8);
status = vsi_nn_kernel_gpu_add_param( node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "scale_xy", scale_factor);
status |= vsi_nn_kernel_gpu_add_param( node, "dfpScale", &dfpScale);
CHECK_STATUS_FAIL_GOTO(status, final );
@ -485,10 +512,33 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniU8SubZPtoFp32_left_4x4 = {{
0x09090909, // TCfg
0x04040404, // ASelt
0x00020000, 0x00060004, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniU8RightSubLeft_4x4 = {{
0x09090909, // TCfg
0x00000000, // ASelt
0x00230001, 0x00670045, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
if (F16 == output_dtype)
{
status = vsi_nn_kernel_gpu_add_param( node, "uint8Scale", &input_scale);
status = vsi_nn_kernel_gpu_add_param( node, "uint8Scale", &input_scale);
status |= vsi_nn_kernel_gpu_add_param( node, "uniU8SubZPtoFp32_left_4x4", &uniU8SubZPtoFp32_left_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniU8RightSubLeft_4x4", &uniU8RightSubLeft_4x4);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
@ -544,13 +594,21 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
}
else
{
status = vsi_nn_kernel_gpu_add_param( node, "uniU8SubZPtoFp32_left_4x4", &uniU8SubZPtoFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param( node,
"uniU8SubZPtoFp32_part1_4x4", &uniU8SubZPtoFp32_part1_4x4);
"uniU8RightSubLeft_4x4", &uniU8SubZPtoFp32_part1_4x4);
}
CHECK_STATUS_FAIL_GOTO(status, final );
gpu_param.global_scale[2] = depth;
}
else if (!is_use_scale_kernel)
{
status = vsi_nn_kernel_gpu_add_param( node, "uniU8SubZPtoFp32_left_4x4", &uniU8SubZPtoFp32_left_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniU8RightSubLeft_4x4", &uniU8RightSubLeft_4x4);
CHECK_STATUS_FAIL_GOTO(status, final );
}
if (!is_use_scale_kernel)
{
status = vsi_nn_kernel_gpu_add_param( node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8);
@ -562,8 +620,7 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
status = vsi_nn_kernel_gpu_add_param( node, "scale_xy", scale_factor);
if (!is_use_scale_kernel)
{
status = vsi_nn_kernel_gpu_add_param( node, "uniU8SubZPtoFp32_4x4", &uniU8SubZPtoFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "input_ZP", &inputZP);
status = vsi_nn_kernel_gpu_add_param( node, "input_ZP", &inputZP);
}
CHECK_STATUS_FAIL_GOTO(status, final );
}
@ -581,25 +638,25 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniFp16toFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00000000, 0x00003c00, 0x00000000,
0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniRightSubLeft_4x4 = {{
0x09090909, // TCfg
0x04040404, // ASelt
0x00110000, 0x00330022, // ABin
0x00000000, // ASelt
0x00230001, 0x00670045, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000,
0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant
0x00000400, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniFp16toFp32_left_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00020000, 0x00060004, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtactHalf8_2x8 = {{
0x11111111, // TCfg
@ -634,7 +691,17 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniFp16toFp32_part1_4x4 = {{
gpu_dp_inst_t uniFp16toFp32_Lo_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniFp16toFp32_Hi_4x4 = {{
0x09090909, // TCfg
0x00000000, // ASelt
0x00150004, 0x00370026, // ABin
@ -647,7 +714,8 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
status = vsi_nn_kernel_gpu_add_param( node, "uniConvertI32toI16_2x8", &uniConvertI32toI16_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniGetMaskShift_2x8", &uniGetMaskShift_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniFp16toFp32_part1_4x4", &uniFp16toFp32_part1_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniFp16toFp32_left_4x4", &uniFp16toFp32_Lo_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniFp16toFp32_Hi_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniExtactHalf8_2x8", &uniExtactHalf8_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth);
CHECK_STATUS_FAIL_GOTO(status, final );
@ -657,19 +725,21 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
else if (F16 == output_dtype)
{
status = vsi_nn_kernel_gpu_add_param( node, "uniExtactHalf8_2x8", &uniExtactHalf8_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniRightSubLeft_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniFp16toFp32_left_4x4", &uniFp16toFp32_left_4x4);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
{
status = vsi_nn_kernel_gpu_add_param( node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniRightSubLeft_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniFp16toFp32_left_4x4", &uniFp16toFp32_left_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uint8Scale", &uint8Scale);
status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &uint8ZP_out);
CHECK_STATUS_FAIL_GOTO(status, final );
}
status = vsi_nn_kernel_gpu_add_param( node, "uniFp16toFp32_4x4", &uniFp16toFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniRightSubLeft_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "scale_xy", scale_factor);
status = vsi_nn_kernel_gpu_add_param( node, "scale_xy", scale_factor);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (BF16 == input_dtype && BF16 == output_dtype)

View File

@ -0,0 +1,366 @@
/****************************************************************************
*
* Copyright (c) 2019 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_U8TOU8 CVIVANTE_NAMESPACE("evis.space2depth_internal_U8toU8")
#define VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_U8TOU8_X2Y1 CVIVANTE_NAMESPACE("evis.space2depth_internal_U8toU8_X2Y1")
#define VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_I8TOI8 CVIVANTE_NAMESPACE("evis.space2depth_internal_I8toI8")
#define VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_I8TOI8_X2Y1 CVIVANTE_NAMESPACE("evis.space2depth_internal_I8toI8_X2Y1")
#define VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_I16TOI16 CVIVANTE_NAMESPACE("evis.space2depth_internal_I16toI16")
#define VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_I16TOI16_X2Y1 CVIVANTE_NAMESPACE("evis.space2depth_internal_I16toI16_X2Y1")
#define VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_F16TOF16 CVIVANTE_NAMESPACE("evis.space2depth_internal_F16toF16")
#define VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_F16TOF16_X2Y1 CVIVANTE_NAMESPACE("evis.space2depth_internal_F16toF16_X2Y1")
#define KERNEL_SOURCE_1 "space2depth_internal"
// Add kernel hashtable here
#define HASH_SPACE2DEPTH_INTERNAL_KEY(_input0_type, _output_type, _opt_stride) \
((_input0_type << 24) | (_output_type << 16) | (_opt_stride << 8))
#define TENSOR_SPACE2DEPTH_INTERNAL_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_SPACE2DEPTH_INTERNAL_KEY(IN0_TYPE, OUT_TYPE, 0), \
VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_##IN0_TYPE##TO##OUT_TYPE, \
SOURCE },
#define TENSOR_SPACE2DEPTH_INTERNAL_OPT_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_SPACE2DEPTH_INTERNAL_KEY(IN0_TYPE, OUT_TYPE, 1), \
VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_##IN0_TYPE##TO##OUT_TYPE##_X2Y1, \
SOURCE },
static const struct {
uint32_t key;
char* function_name;
const char* source_name;
} space2depth_internal_map[] =
{
TENSOR_SPACE2DEPTH_INTERNAL_KERNELS(U8, U8, KERNEL_SOURCE_1)
TENSOR_SPACE2DEPTH_INTERNAL_KERNELS(I8, I8, KERNEL_SOURCE_1)
TENSOR_SPACE2DEPTH_INTERNAL_KERNELS(I16, I16, KERNEL_SOURCE_1)
TENSOR_SPACE2DEPTH_INTERNAL_KERNELS(F16, F16, KERNEL_SOURCE_1)
TENSOR_SPACE2DEPTH_INTERNAL_OPT_KERNELS(U8, U8, KERNEL_SOURCE_1)
TENSOR_SPACE2DEPTH_INTERNAL_OPT_KERNELS(I8, I8, KERNEL_SOURCE_1)
TENSOR_SPACE2DEPTH_INTERNAL_OPT_KERNELS(I16, I16, KERNEL_SOURCE_1)
TENSOR_SPACE2DEPTH_INTERNAL_OPT_KERNELS(F16, F16, KERNEL_SOURCE_1)
};
/*
* Kernel params
*/
static vx_param_description_t _space2depth_internal_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _SPACE2DEPTH_INTERNAL_PARAM_NUM _cnt_of_array( _space2depth_internal_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_space2depth_internal_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t shaderParam = {
3, // workdim
{0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in thread
{0, 0, 0}}; // globalWorkSize: image size in thread
uint32_t input_dims = 0;
vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
int32_t input_width = 0;
int32_t input_height = 0;
int32_t input_depth = 0;
int32_t stride_x = 0;
int32_t stride_y = 0;
int32_t opt_flg = 0;
uint32_t pack_key = 0;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &stride_x);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &stride_y);
CHECK_STATUS_FAIL_GOTO(status, OnError );
input_dims = (uint32_t)attr[0]->shape->size;
input_width = attr[0]->shape->data[0];
input_height = attr[0]->shape->data[1];
input_depth = input_dims > 2 ? attr[0]->shape->data[2] : 1;
shaderParam.global_scale[0] = 1;
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
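/* Fast path for stride_x == 2, stride_y == 1: each thread consumes 16 bytes of
 * input per iteration, i.e. 16 pixels for 8-bit types and 8 pixels for 16-bit types. */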
if (stride_x == 2 && stride_y == 1)
{
shaderParam.global_scale[0] = 16;
if (attr[0]->dtype == F16 || attr[0]->dtype == I16)
{
shaderParam.global_scale[0] = 8;
}
opt_flg = 1;
}
shaderParam.global_size[0] = gpu_align_p2((input_width + shaderParam.global_scale[0] - 1)
/ shaderParam.global_scale[0], 4);
shaderParam.global_size[1] = input_height;
shaderParam.global_size[2] = input_depth;
status = vsi_nn_kernel_gpu_config( node, &shaderParam );
CHECK_STATUS_FAIL_GOTO(status, OnError);
#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE, OPT_FLG ) \
(IN0_TYPE | (OUT_TYPE << 8) | (OPT_FLG << 16))
pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype, opt_flg);
{
gpu_dp_inst_t uniExtractEvenUint8Stride2_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x06040200, 0x0e0c0a08, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000700, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractOddUint8Stride2_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x07050301, 0x0f0d0b09, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000700, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractEvenFp16Stride2_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00020000, 0x00060004, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractOddFp16Stride2_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00030001, 0x00070005, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
status = vsi_nn_kernel_gpu_add_param(node, "input_depth", &input_depth);
CHECK_STATUS_FAIL_GOTO(status, OnError );
switch( pack_key )
{
case _PACK_SELECT_KEY( U8, U8, 0 ):
case _PACK_SELECT_KEY( I8, I8, 0 ):
case _PACK_SELECT_KEY( I16, I16, 0 ):
case _PACK_SELECT_KEY( F16, F16, 0 ):
break;
case _PACK_SELECT_KEY( U8, U8, 1 ):
case _PACK_SELECT_KEY( I8, I8, 1 ):
{
status = vsi_nn_kernel_gpu_add_param( node,
"uniExtractEvenUint8Stride2_2x8", &uniExtractEvenUint8Stride2_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtractOddUint8Stride2_2x8", &uniExtractOddUint8Stride2_2x8 );
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( I16, I16, 1 ):
case _PACK_SELECT_KEY( F16, F16, 1 ):
{
status = vsi_nn_kernel_gpu_add_param( node,
"uniExtractEvenFp16Stride2_4x4", &uniExtractEvenFp16Stride2_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtractOddFp16Stride2_4x4", &uniExtractOddFp16Stride2_4x4 );
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
default:
break;
}
}
#undef _PACK_SELECT_KEY
CHECK_STATUS_FAIL_GOTO(status, OnError );
OnError:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;
}
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel,
const vsi_nn_kernel_param_t * params,
int32_t opt_flg
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
int i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = HASH_SPACE2DEPTH_INTERNAL_KEY( input0_dtype, output_dtype, opt_flg );
for( i = 0; i < _cnt_of_array(space2depth_internal_map); i ++ )
{
if ( space2depth_internal_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(space2depth_internal_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", space2depth_internal_map[i].function_name );
kernel->info.parameters = _space2depth_internal_kernel_param_def;
kernel->info.numParams = _SPACE2DEPTH_INTERNAL_PARAM_NUM;
kernel->info.initialize = _space2depth_internal_initializer;
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
space2depth_internal_map[i].source_name );
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
space2depth_internal_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t tmp_params[_SPACE2DEPTH_INTERNAL_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
int32_t block_size_x = vsi_nn_kernel_param_get_int32( params, "block_size_x" );
int32_t block_size_y = vsi_nn_kernel_param_get_int32( params, "block_size_y" );
int32_t opt_flg = (block_size_x == 2 && block_size_y == 1) ? 1 : 0;
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( inputs, outputs, kernel, params, opt_flg );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
vsi_nn_kernel_node_pack_io( tmp_params, _SPACE2DEPTH_INTERNAL_PARAM_NUM, inputs, 1, outputs, 1 );
tmp_params[2] = vsi_nn_kernel_scalar_create( graph, I32, &block_size_x );
tmp_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &block_size_y );
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _SPACE2DEPTH_INTERNAL_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &tmp_params[2] );
vsi_nn_kernel_scalar_release( &tmp_params[3] );
{
// Set default border mode.
vx_border_t border;
border.mode = VX_BORDER_CONSTANT;
border.constant_value.U8 = 0;
border.constant_value.U16 = 0;
if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8)
{
border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point;
}
status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) );
CHECK_STATUS(status);
}
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( space2depth_internal, _setup )
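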

View File

@ -0,0 +1,422 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
typedef enum
{
UP_ORG = 0,
UP_K2,
} _internal_upscale_e;
#define _UPSAMPLESCALE_KERNEL_SOURCE "upsamplescale"
#define _UPSAMPLESCALE_KERNEL_K2_SOURCE "upsamplescale_k2"
#define _UPSAMPLESCALE_KERNEL_NAME CVIVANTE_NAMESPACE("evis.upsamplescale")
#define STR(a) #a
// Add kernel hashtable here
#define UPSAMPLESCALE_HASH_KEY( IN_DTYPE, OUT_DTYPE, FLAG ) \
(( IN_DTYPE ) | ( OUT_DTYPE << 8) | ( FLAG << 16))
#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
{ UPSAMPLESCALE_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_ORG ), \
CVIVANTE_NAMESPACE("evis.upsamplescale_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
_UPSAMPLESCALE_KERNEL_SOURCE }
#define PACK_KERNEL_MAP_K2( IN_DTYPE, OUT_DTYPE ) \
{ UPSAMPLESCALE_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_K2 ), \
CVIVANTE_NAMESPACE("evis.upsamplescale_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_K2"), \
_UPSAMPLESCALE_KERNEL_K2_SOURCE }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _upsamplescale_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP( F16, F16 ),
PACK_KERNEL_MAP( F16, I16 ),
PACK_KERNEL_MAP( F16, I8 ),
PACK_KERNEL_MAP( F16, U8 ),
PACK_KERNEL_MAP( I16, I16 ),
PACK_KERNEL_MAP( I16, F16 ),
PACK_KERNEL_MAP( I8, I8 ),
PACK_KERNEL_MAP( I8, F16 ),
PACK_KERNEL_MAP( U8, U8 ),
PACK_KERNEL_MAP( U8, F16 ),
PACK_KERNEL_MAP_K2( F16, F16 ),
PACK_KERNEL_MAP_K2( F16, I16 ),
PACK_KERNEL_MAP_K2( F16, I8 ),
PACK_KERNEL_MAP_K2( F16, U8 ),
PACK_KERNEL_MAP_K2( I16, I16 ),
PACK_KERNEL_MAP_K2( I16, F16 ),
PACK_KERNEL_MAP_K2( I8, I8 ),
PACK_KERNEL_MAP_K2( I8, F16 ),
PACK_KERNEL_MAP_K2( U8, U8 ),
PACK_KERNEL_MAP_K2( U8, F16 ),
};
/*
* Kernel params
*/
static vx_param_description_t _upsamplescale_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _UPSAMPLESCALE_PARAM_NUM _cnt_of_array( _upsamplescale_kernel_param_def )
#define SCALAR_STRIDE_VALUE (2)
#define SCALAR_SCALE_VALUE (3)
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_upsamplescale_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
#define _PACK_UPSCALE_KEY( IN_TYPE, OUT_TYPE, FLAG ) \
( IN_TYPE | ( OUT_TYPE << 16) | (FLAG << 24) )
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
vsi_int_array_t * in_shape = NULL;
vsi_nn_kernel_dtype_e input_dtype = F16;
vsi_nn_kernel_dtype_e output_dtype = F16;
int32_t stride = 0;
float scale = 0;
float scaleIn = 1.0f;
float scaleOut = 1.0f;
int32_t output_ZP = 0;
int32_t input_ZP = 0;
int32_t srcFixPointPos = 0;
int32_t dstFixPointPos = 0;
uint32_t pack_key = 0;
_internal_upscale_e flag = UP_ORG;
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
in_shape = input_attr->shape;
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_STRIDE_VALUE], &(stride));
vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_SCALE_VALUE], &(scale));
input_dtype = input_attr->dtype;
output_dtype = output_attr->dtype;
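/* For DFP quantization the stored fractional length fl gives an effective scale of
 * 2^-fl; the sign checks below only avoid shifting by a negative count. */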
if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
srcFixPointPos = input_attr->dfp.fl;
if (srcFixPointPos >=0 )
scaleIn = 1.0f / (float) ((int64_t)1 << srcFixPointPos);
else
scaleIn = (float) ((int64_t)1 << -srcFixPointPos);
}
else if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant)
{
input_ZP = input_attr->asymm.zero_point;
scaleIn = input_attr->asymm.scale;
}
if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant)
{
dstFixPointPos = output_attr->dfp.fl;
if (dstFixPointPos >=0 )
scaleOut = 1.0f / (float) ((int64_t)1 << dstFixPointPos);
else
scaleOut = (float) ((int64_t)1 << -dstFixPointPos);
}
else if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant)
{
output_ZP = output_attr->asymm.zero_point;
scaleOut = output_attr->asymm.scale;
}
if (stride == 2 && scale >= 0)
{
flag = UP_K2;
}
if ( flag == UP_K2 )
{
gpu_param.global_scale[0] = 8;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
}
else
{
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
}
gpu_param.global_size[0] = gpu_align_p2(
(in_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (
(in_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1]);
gpu_param.global_size[2] = in_shape->size > 2 ? in_shape->data[2] : 1;
pack_key = _PACK_UPSCALE_KEY( input_dtype, output_dtype, flag );
switch( pack_key )
{
case _PACK_UPSCALE_KEY( F16, F16, UP_K2 ):
case _PACK_UPSCALE_KEY( F16, I16, UP_K2 ):
case _PACK_UPSCALE_KEY( F16, I8, UP_K2 ):
case _PACK_UPSCALE_KEY( F16, U8, UP_K2 ):
case _PACK_UPSCALE_KEY( I16, F16, UP_K2 ):
case _PACK_UPSCALE_KEY( I16, I16, UP_K2 ):
case _PACK_UPSCALE_KEY( I8, F16, UP_K2 ):
case _PACK_UPSCALE_KEY( I8, I8, UP_K2 ):
case _PACK_UPSCALE_KEY( U8, F16, UP_K2 ):
case _PACK_UPSCALE_KEY( U8, U8, UP_K2 ):
{
uint16_t multiplier = 0;
int32_t postShift = 0;
uint32_t multAndoutZP[2] = {0};
gpu_dp_inst_t uniUpSampleScale2X_lo_2x8 = {{
0xdddddddd, // TCfg
0x44444444, // ASelt
0x11111010, 0x13131212, // ABin
0x11111111, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniUpSampleScale2X_hi_2x8 = {{
0xdddddddd, // TCfg
0x44444444, // ASelt
0x15151414, 0x17171616, // ABin
0x11111111, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
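/* The combined requantization factor scaleIn * scale / scaleOut is approximated by a
 * 16-bit multiplier plus a right shift (postShift), so that
 * (x * multiplier + multAndoutZP[1]) >> postShift maps a quantized input x straight to
 * the output domain; the low bits of postShift are also patched into the PostShift
 * field (data[7]) of the two DP instructions. */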
gpu_quantize_multiplier_16bit(scaleIn * scale / scaleOut, &multiplier, &postShift);
multAndoutZP[0] = (uint32_t)(multiplier);
multAndoutZP[1] = (uint32_t)((output_ZP << postShift) - input_ZP * multiplier);
uniUpSampleScale2X_lo_2x8.data[7] |= (postShift & 0x1F);
uniUpSampleScale2X_hi_2x8.data[7] |= (postShift & 0x1F);
status = vsi_nn_kernel_gpu_add_param( node, "uniUpScale2X_lo_2x8", &uniUpSampleScale2X_lo_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniUpScale2X_hi_2x8", &uniUpSampleScale2X_hi_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP", multAndoutZP);
}
break;
case _PACK_UPSCALE_KEY( F16, F16, UP_ORG ):
case _PACK_UPSCALE_KEY( F16, I16, UP_ORG ):
case _PACK_UPSCALE_KEY( F16, I8, UP_ORG ):
case _PACK_UPSCALE_KEY( F16, U8, UP_ORG ):
case _PACK_UPSCALE_KEY( I16, F16, UP_ORG ):
case _PACK_UPSCALE_KEY( I16, I16, UP_ORG ):
case _PACK_UPSCALE_KEY( I8, F16, UP_ORG ):
case _PACK_UPSCALE_KEY( I8, I8, UP_ORG ):
case _PACK_UPSCALE_KEY( U8, F16, UP_ORG ):
case _PACK_UPSCALE_KEY( U8, U8, UP_ORG ):
{
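/* Generic path: each element is converted to float and mapped as
 * out = in * output_scale + tail, which equals
 * (in - input_ZP) * scaleIn * scale / scaleOut + output_ZP. */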
float output_scale = scaleIn * scale / scaleOut;
float tail = output_ZP - input_ZP * output_scale;
gpu_dp_inst_t uniConvertDatatoF32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00000000, 0x00003c00, 0x00000000,
0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniConvertDatatoF32_4x4", &uniConvertDatatoF32_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "output_scale", &output_scale);
status |= vsi_nn_kernel_gpu_add_param( node, "tail", &tail);
}
break;
default:
break;
}
#undef _PACK_UPSCALE_KEY
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (input_attr)
{
vsi_nn_kernel_tensor_attr_release( &input_attr );
input_attr = NULL;
}
if (output_attr)
{
vsi_nn_kernel_tensor_attr_release( &output_attr );
output_attr = NULL;
}
return status;
} /* _upsamplescale_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t stride,
float scale
/* Add extra params */
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _upsamplescale_kernel_map;
vx_param_description_t * param_def = _upsamplescale_kernel_param_def;
size_t param_def_size = _cnt_of_array( _upsamplescale_kernel_param_def );
vx_kernel_initialize_f initializer = _upsamplescale_initializer;
_internal_upscale_e flag = (stride == 2 && scale >= 0 ) ? UP_K2 : UP_ORG;
uint32_t key = 0;
int i;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = UPSAMPLESCALE_HASH_KEY( in_dtype, out_dtype, flag );
for( i = 0; i < _cnt_of_array( _upsamplescale_kernel_map ); i ++ )
{
if( kernel_map[i].key == key )
{
break;
}
}
if( i < _cnt_of_array( _upsamplescale_kernel_map ) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = (uint32_t)param_def_size;
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_UPSAMPLESCALE_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
int32_t stride = vsi_nn_kernel_param_get_int32( params, "stride" );
float scale = vsi_nn_kernel_param_get_float32( params, "scale" );
status = _query_kernel( kernel, inputs, outputs, stride, scale );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _UPSAMPLESCALE_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[SCALAR_STRIDE_VALUE] = vsi_nn_kernel_scalar_create( graph, I32, &stride );
node_params[SCALAR_SCALE_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &scale );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _UPSAMPLESCALE_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_STRIDE_VALUE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_VALUE] );
VSI_ASSERT( status == VSI_SUCCESS );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( upsamplescale, _setup )
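For orientation, a minimal host-side sketch of the float math this operator appears to implement, assuming upsamplescale performs nearest-neighbour upsampling by the integer stride followed by a multiplication with scale; the helper name and flat memory layout below are illustrative, not part of the library:
/* Illustrative sketch only: every input pixel is replicated into a
 * stride x stride block of the output and multiplied by scale. */
static void upsamplescale_ref(const float* in, float* out,
    int width, int height, int stride, float scale)
{
    int out_w = width * stride;
    int x, y, dx, dy;
    for (y = 0; y < height; y++)
    {
        for (x = 0; x < width; x++)
        {
            float v = in[y * width + x] * scale;
            for (dy = 0; dy < stride; dy++)
            {
                for (dx = 0; dx < stride; dx++)
                {
                    out[(y * stride + dy) * out_w + (x * stride + dx)] = v;
                }
            }
        }
    }
}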

View File

@ -24,6 +24,7 @@
#include <stdint.h>
#include "vsi_nn_tensor.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "utils/vsi_nn_math.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
@ -53,7 +54,7 @@ static vsi_bool compute_gpu_divisor
int32_t i = 0;
for( i = vsi_nn_min( input_value, limit - 1 ); i > 0; i -- )
{
if( ( i % gcd == 0 ) && ( input_value % i == 0 ) )
if ( ( i % gcd == 0 ) && ( input_value % i == 0 ) )
{
*divisor = i;
return TRUE;
@ -75,7 +76,7 @@ static size_t element_fill_dim
if (size_x == 1)
return 0;
if( size_x < GPU_TENSOR_MAX_WIDTH)
if ( size_x < GPU_TENSOR_MAX_WIDTH)
{
shape_x[rank_x] = size_x;
}
@ -85,7 +86,7 @@ static size_t element_fill_dim
int32_t remainder = 0;
compute_gpu_divisor( size_x, GPU_TENSOR_MAX_WIDTH, 1, &divisor );
remainder = size_x / divisor;
if( remainder > GPU_TENSOR_MAX_WIDTH || rank_x >= max_rank)
if ( remainder > GPU_TENSOR_MAX_WIDTH || rank_x >= max_rank)
{
// Cannot optimize.
shape_x[rank_x] = size_x;
@ -97,7 +98,7 @@ static size_t element_fill_dim
* so it should be always 2.
*/
cost_size = 2;
if( size_x > 1 )
if ( size_x > 1 )
{
shape_x[rank_x] = divisor;
shape_x[rank_x + 1] = remainder;
@ -170,25 +171,25 @@ vsi_bool vsi_nn_kernel_optimize_reduce_shape
rank_in += element_fill_dim(out_shape_x, rank_in, GPU_TENSOR_MAX_WIDTH, outerSize);
rank_out += element_fill_dim(out_shape_output, rank_out, GPU_TENSOR_MAX_WIDTH, outerSize);
if( 0 == rank_in )
if ( 0 == rank_in )
{
out_shape_x[0] = 1;
out_shape_x[1] = 1;
rank_in = 2;
}
else if( 1 == rank_in )
else if ( 1 == rank_in )
{
out_shape_x[1] = 1;
rank_in = 2;
}
if( 0 == rank_out )
if ( 0 == rank_out )
{
out_shape_output[0] = 1;
out_shape_output[1] = 1;
rank_out = 2;
}
else if( 1 == rank_out )
else if ( 1 == rank_out )
{
out_shape_output[1] = 1;
rank_out = 2;
@ -200,6 +201,75 @@ vsi_bool vsi_nn_kernel_optimize_reduce_shape
return ret;
} /* vsi_nn_kernel_optimize_reduce_shape() */
vsi_bool vsi_nn_kernel_optimize_tensor_shape
(
const int32_t* shape_x, const size_t rank_x,
const int32_t *axis, const size_t axis_size,
int32_t* out_shape_x, uint32_t* out_rank_x,
int32_t* out_axis, uint32_t* out_axis_size
)
{
vsi_bool ret = TRUE;
size_t i = 0;
size_t rank_in = 0;
size_t dims = 0;
int32_t innerSize = 1;
int32_t outerSize = 1;
int32_t axisSize = 1;
for (i = 0; i < axis_size; i++)
{
axisSize *= shape_x[axis[i]];
}
for (i = 0; i < (size_t)axis[0]; i++)
{
innerSize *= shape_x[i];
}
for (i = axis[axis_size - 1] + 1; i < rank_x; i++)
{
outerSize *= shape_x[i];
}
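/* The shape is viewed as [inner, axis-group, outer] around the contiguous axis block,
 * e.g. shape {4, 5, 6, 7} with axis {1, 2} gives innerSize = 4, axisSize = 30 and
 * outerSize = 7; element_fill_dim() below re-splits each part so that no single
 * dimension exceeds GPU_TENSOR_MAX_WIDTH. */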
rank_in += element_fill_dim(out_shape_x, rank_in, GPU_TENSOR_MAX_WIDTH, innerSize);
dims = element_fill_dim(out_shape_x, rank_in, GPU_TENSOR_MAX_WIDTH, axisSize);
if (dims == 0)
{
out_axis[0] = (int32_t)rank_in;
*out_axis_size = 1;
out_shape_x[rank_in ++] = 1;
}
else
{
*out_axis_size = (uint32_t)dims;
for (i = 0; i < dims; i++)
{
out_axis[i] = (int32_t)rank_in + (int32_t)i;
}
}
rank_in += dims;
rank_in += element_fill_dim(out_shape_x, rank_in, GPU_TENSOR_MAX_WIDTH, outerSize);
if ( 0 == rank_in )
{
out_shape_x[0] = 1;
out_shape_x[1] = 1;
rank_in = 2;
}
else if ( 1 == rank_in )
{
out_shape_x[1] = 1;
rank_in = 2;
}
*out_rank_x = (uint32_t)rank_in;
return ret;
} /* vsi_nn_kernel_optimize_tensor_shape() */
vsi_bool vsi_nn_kernel_optimize_element_shape
(
const int32_t* shape_x, const size_t rank_x,
@ -218,13 +288,13 @@ vsi_bool vsi_nn_kernel_optimize_element_shape
rank_in += element_fill_dim(out_shape_x, rank_in, GPU_TENSOR_MAX_WIDTH, element_num);
if( 0 == rank_in )
if ( 0 == rank_in )
{
out_shape_x[0] = 1;
out_shape_x[1] = 1;
rank_in = 2;
}
else if( 1 == rank_in )
else if ( 1 == rank_in )
{
out_shape_x[1] = 1;
rank_in = 2;
@ -275,13 +345,13 @@ vsi_bool vsi_nn_kernel_optimize_softmax_shape
rank_in += element_fill_dim(out_shape_x, rank_in, GPU_TENSOR_MAX_WIDTH, outerSize);
if( 0 == rank_in )
if ( 0 == rank_in )
{
out_shape_x[0] = 1;
out_shape_x[1] = 1;
rank_in = 2;
}
else if( 1 == rank_in )
else if ( 1 == rank_in )
{
out_shape_x[1] = 1;
rank_in = 2;
@ -313,7 +383,7 @@ static size_t tile_fill_dim
size_t cost_size = 1;
VSI_ASSERT( rank <= max_rank );
VSI_ASSERT( size_output >= (int32_t)((int64_t)(0xFFFFFFFF) - 1) );
if( size_output < GPU_TENSOR_MAX_WIDTH )
if ( size_output < GPU_TENSOR_MAX_WIDTH )
{
shape_x[rank] = size_x;
shape_y[rank] = size_y;
@ -325,7 +395,7 @@ static size_t tile_fill_dim
int32_t remainder = 0;
compute_gpu_divisor( size_output, GPU_TENSOR_MAX_WIDTH, 1, &divisor );
remainder = size_output / divisor;
if( remainder > GPU_TENSOR_MAX_WIDTH || rank >= max_rank )
if ( remainder > GPU_TENSOR_MAX_WIDTH || rank >= max_rank )
{
// Cannot optimize.
shape_x[rank] = size_x;
@ -339,7 +409,7 @@ static size_t tile_fill_dim
* so it should be always 2.
*/
cost_size = 2;
if( size_x > 1 )
if ( size_x > 1 )
{
shape_x[rank] = divisor;
shape_x[rank + 1] = remainder;
@ -349,7 +419,7 @@ static size_t tile_fill_dim
shape_x[rank] = 1;
shape_x[rank + 1] = 1;
}
if( size_y > 1 )
if ( size_y > 1 )
{
shape_y[rank] = divisor;
shape_y[rank + 1] = remainder;
@ -401,20 +471,20 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape
sz = shape_output[i];
/*
* Skip dim if the size is equal to 1
* Also skip if( sx == 1 && sy == 1 )
* Also skip if ( sx == 1 && sy == 1 )
*/
if( shape_output[i] == 1 )
if ( shape_output[i] == 1 )
{
continue;
}
// Update state
state = TILE_STATE_EMPTY;
if( sx == sz )
if ( sx == sz )
{
state = TILE_STATE_NO_AXIS;
}
else if( sx != sz )
else if ( sx != sz )
{
state = TILE_STATE_AXIS_X;
}
@ -472,16 +542,16 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape
break;
}
#undef _pack_state
if( append_dim )
if ( append_dim )
{
dims += tile_fill_dim( out_shape_x, out_shape_y, out_shape_output,
dims, VSI_NN_MAX_DIM_NUM, sx, sy, sz );
}
}
if( ret )
if ( ret )
{
/* Append the last dim */
if( i == rank_output )
if ( i == rank_output )
{
sx = effective_size_x;
sy = effective_size_y;
@ -490,7 +560,7 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape
dims, VSI_NN_MAX_DIM_NUM, sx, sy, sz );
}
/* Avoid 1D shape*/
if( 1 == dims )
if ( 1 == dims )
{
out_shape_x[1] = 1;
out_shape_y[1] = 1;
@ -508,3 +578,39 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape
#undef _swap_size
return ret;
} /* vsi_nn_kernel_optimize_eltwise_shape() */
vsi_bool vsi_nn_kernel_optimize_1d_tensor_shape
(
const int32_t* shape, const uint32_t rank,
int32_t* out_shape, uint32_t* out_rank
)
{
memcpy(out_shape, shape, sizeof(int32_t) * rank);
*out_rank = vsi_nn_max(rank, 2);
out_shape[1] = rank == 1 ? 1 : out_shape[1];
return TRUE;
}
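/* Collapse every dimension above the third into dimension 2, e.g. a {W, H, C, N}
 * shape becomes {W, H, C * N}; the resulting rank is at most 3. */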
vsi_bool vsi_nn_kernel_optimize_nchw2xhw_shape
(
const int32_t* shape, const uint32_t rank,
int32_t* out_shape, uint32_t* out_rank
)
{
uint32_t dim_num = 0;
uint32_t i = 0;
vsi_nn_kernel_optimize_1d_tensor_shape( shape,
rank, out_shape, &dim_num);
for (i = 3; i < dim_num; i++)
{
out_shape[2] *= out_shape[i];
}
*out_rank = vsi_nn_min(dim_num, 3);
return TRUE;
}

View File

@ -131,10 +131,8 @@ static vsi_nn_kernel_node_t _setup
float index[1024] = {0};
float value[1024] = {0};
if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ||
inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ||
outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ||
outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32)
if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ||
outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 )
{
return NULL;
}

View File

@ -255,7 +255,7 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d )
vx_node node = NULL;
vx_nn_convolution_params_ext2_t vxparam;
vx_tensor temp_tensors[3] = { NULL };
int i;
int32_t i;
vsi_bool need_explicit_padding = FALSE;
_build_vx_conv2d_param(
@ -277,8 +277,17 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d )
if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC)
{
temp_tensors[1] = _expand_tensor_dim( inputs[1]->t,
(int32_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, 0 );
int32_t new_w_shape[VSI_NN_MAX_DIM_NUM] = { 0 };
uint32_t new_w_rank = 4;
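/* View the 1-D weight as a 4-D conv2d weight of shape [1, kernel_size, channels, 1]
 * (width 1), so the depthwise conv1d can run through the regular 2-D convolution path. */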
new_w_shape[0] = 1;
new_w_shape[1] = inputs[1]->attr.size[0];
new_w_shape[2] = 1;
for (i = 1; i < (int32_t)(inputs[1]->attr.dim_num); i++)
{
new_w_shape[2] *= inputs[1]->attr.size[i];
}
new_w_shape[3] = 1;
temp_tensors[1] = vxReshapeTensor( inputs[1]->t, new_w_shape, new_w_rank );
CHECK_PTR_FAIL_GOTO( temp_tensors[1], "Expand kernel dim fail.", final );
}
else

View File

@ -165,10 +165,8 @@ static vsi_nn_kernel_node_t _setup
float index[1024] = {0};
float value[1024] = {0};
if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ||
inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ||
outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ||
outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32)
if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ||
outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 )
{
return NULL;
}

View File

@ -135,10 +135,8 @@ static vsi_nn_kernel_node_t _setup
float index[1024] = {0};
float value[1024] = {0};
if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ||
inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ||
outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ||
outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32)
if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ||
outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 )
{
return NULL;
}

View File

@ -0,0 +1,143 @@
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layer_norm_F32toF32(
__read_only image2d_array_t input,
__read_only image2d_t bias,
__read_only image2d_t scale,
__write_only image2d_array_t output,
float eps,
float input_zp,
float input_scale,
float output_zp,
float output_scale,
float e2InScale,
float scale_inOut,
float sumZpScale,
float zp2ScaleE2,
float sumZpScaleE2,
int width,
int height,
float dim_ratio
)
{
int lidx = get_local_id(0);
int4 coord = (int4)(lidx, get_global_id(1), get_global_id(2), 0);
float4 data, dst;
float2 sumSqr = (float2)(0);
float scale_vari, bias_val;
__local float2 local_sum[16];
for(; coord.x < width;)
{
data = read_imagef(input, coord);
coord.x += 16;
sumSqr.x += data.x;
sumSqr.y += data.x * data.x;
}
local_sum[lidx] = sumSqr;
barrier(CLK_LOCAL_MEM_FENCE);
if(lidx == 0)
{
for(int i = 1; i < 16; i++)
{
sumSqr += local_sum[i];
}
local_sum[0] = sumSqr;
}
barrier(CLK_LOCAL_MEM_FENCE);
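// local_sum[0] now holds (sum, sum of squares) for the whole row; dim_ratio
// (presumably 1 / width) turns them into the mean and E[x^2], and s1 below becomes
// the inverse standard deviation.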
sumSqr = local_sum[0] * dim_ratio;
sumSqr.s1 = sumSqr.s1 - sumSqr.s0 * sumSqr.s0 + eps;
sumSqr.s1 = rsqrt(sumSqr.s1);
for(coord.x = lidx; coord.x < width;)
{
float4 gamma = read_imagef(scale, coord.xw);
float4 beta = read_imagef(bias, coord.xw);
data = read_imagef(input, coord);
scale_vari = gamma.s0 * sumSqr.s1;
bias_val = (beta.s0 - scale_vari * sumSqr.s0);
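// Equivalent to gamma * (x - mean) * rsqrt(var + eps) + beta, with the mean folded
// into bias_val.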
dst.x = data.x * scale_vari + bias_val;
write_imagef(output, coord, dst);
coord.x += 16;
}
}
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layer_norm_U8toU8(
__read_only image2d_array_t input,
__read_only image2d_t bias,
__read_only image2d_t scale,
__write_only image2d_array_t output,
float eps,
float input_zp,
float input_scale,
float output_zp,
float output_scale,
float e2InScale,
float scale_inOut,
float sumZpScale,
float zp2ScaleE2,
float sumZpScaleE2,
int width,
int height,
float dim_ratio
)
{
int lidx = get_local_id(0);
int4 coord = (int4)(lidx, get_global_id(1), get_global_id(2), 0);
uint4 data, dst;
float2 sumSqr;
uint tmpSum = 0, tmpSqr = 0;
float scale_vari, bias_val;
__local uint local_sum[1];
__local uint local_sqr[1];
if(lidx == 0)
{
local_sum[0] = 0;
local_sqr[0] = 0;
}
barrier(CLK_LOCAL_MEM_FENCE);
for(; coord.x < width;)
{
data = read_imageui(input, coord);
coord.x+=16;
tmpSum += data.x;
tmpSqr += data.x * data.x;
}
atom_add(local_sum, tmpSum);
atom_add(local_sqr, tmpSqr);
barrier(CLK_LOCAL_MEM_FENCE);
tmpSum = local_sum[0];
tmpSqr = local_sqr[0];
//sumSqr.x = ((float)tmpSum - width * input_zp) * input_scale;
//sumSqr.y = ((float)tmpSqr - 2 * input_zp * (float)tmpSum + width * input_zp * input_zp) * e2InScale;
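// Rearranging the commented formulas above, the host is expected to pass
// sumZpScale = width * input_zp * input_scale, e2InScale = input_scale^2,
// zp2ScaleE2 = 2 * input_zp * e2InScale and sumZpScaleE2 = width * input_zp^2 * e2InScale.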
sumSqr.x = (float)tmpSum * input_scale - sumZpScale;
sumSqr.y = (float)tmpSqr * e2InScale - zp2ScaleE2 * (float)tmpSum + sumZpScaleE2;
sumSqr *= dim_ratio;
sumSqr.s1 = sumSqr.s1 - sumSqr.s0 * sumSqr.s0 + eps;
sumSqr.s1 = rsqrt(sumSqr.s1);
for(coord.x = lidx; coord.x < width;)
{
float4 gamma = read_imagef(scale, coord.xw);
float4 beta = read_imagef(bias, coord.xw);
data = read_imageui(input, coord);
scale_vari = gamma.s0 * sumSqr.s1;
float alpha = scale_inOut * scale_vari;
bias_val = (beta.s0 - scale_vari * sumSqr.s0) * output_scale + output_zp;
float tmpVal = data.x - input_zp;
float4 norm;
norm.x = tmpVal * alpha + bias_val;
dst = convert_uint4_rte(norm);
write_imageui(output, coord, dst);
coord.x+=16;
}
}

View File

@ -6,32 +6,30 @@ __kernel void gemm_F32F32toF32_2D(
int K,
int N,
int ac2zero,
int bc2zero
int bc2zero,
float scale_a,
float zp_a,
float scale_b,
float zp_b,
float scale_out,
float zp_out
)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int2 coord_a = (int2)(0, gidy);
int2 coord_b = (int2)(gidx, 0);
int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);
float4 sum = (float4)(0);
for(; coord_a.x < K;)
for(; coord.z < K;)
{
float4 tempA0;
float4 tempB0;
tempA0 = read_imagef(inputA, coord_a);
tempB0 = read_imagef(inputB, coord_b);
coord_a.x++;
coord_b.y++;
tempA0 = read_imagef(inputA, coord.zy);
tempB0 = read_imagef(inputB, coord.xz);
coord.z++;
sum += tempA0 * tempB0;
sum = sum + tempA0 * tempB0;
}
coord_b.y = gidy;
write_imagef(output, coord_b, sum);
write_imagef(output, coord.xy, sum);
}
__kernel void gemm_F32F32toF32_3D(
@ -42,7 +40,13 @@ __kernel void gemm_F32F32toF32_3D(
int K,
int N,
int ac2zero,
int bc2zero
int bc2zero,
float scale_a,
float zp_a,
float scale_b,
float zp_b,
float scale_out,
float zp_out
)
{
int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? 0 : get_global_id(2)), 0);
@ -60,10 +64,160 @@ __kernel void gemm_F32F32toF32_3D(
coord_a.x++;
coord_b.y++;
sum += tempA0 * tempB0;
sum = sum + tempA0 * tempB0;
}
coord_b.y = get_global_id(1);
coord_b.z = get_global_id(2);
write_imagef(output, coord_b, sum);
}
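// The gemm_transb_* kernels below compute A x B^T: B is read with swapped coordinates
// (row n, column k), and the I8 variants dequantize B as (b - zp_b) * scale_b before
// accumulating.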
__kernel void gemm_transb_F32F32toF32_2D(
__read_only image2d_t inputA,
__read_only image2d_t inputB,
__write_only image2d_t output,
int M,
int K,
int N,
int ac2zero,
int bc2zero,
float scale_a,
float zp_a,
float scale_b,
float zp_b,
float scale_out,
float zp_out
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);
float4 sum = (float4)(0);
for(; coord.z < K;)
{
float4 tempA0;
float4 tempB0;
tempA0 = read_imagef(inputA, coord.zy);
tempB0 = read_imagef(inputB, coord.zx);
coord.z++;
sum = sum + tempA0 * tempB0;
}
write_imagef(output, coord.xy, sum);
}
__kernel void gemm_transb_F32F32toF32_3D(
__read_only image2d_array_t inputA,
__read_only image2d_array_t inputB,
__write_only image2d_array_t output,
int M,
int K,
int N,
int ac2zero,
int bc2zero,
float scale_a,
float zp_a,
float scale_b,
float zp_b,
float scale_out,
float zp_out
)
{
int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? 0 : get_global_id(2)), 0);
int4 coord_b = (int4)(0, get_global_id(0), (bc2zero ? 0 : get_global_id(2)), 0);
float4 sum = (float4)(0);
for(; coord_a.x < K;)
{
float4 tempA0;
float4 tempB0;
tempA0 = read_imagef(inputA, coord_a);
tempB0 = read_imagef(inputB, coord_b);
coord_a.x++;
coord_b.x++;
sum = sum + tempA0 * tempB0;
}
coord_a.x = get_global_id(0);
coord_a.z = get_global_id(2);
write_imagef(output, coord_a, sum);
}
__kernel void gemm_transb_F32I8toF32_2D(
__read_only image2d_t inputA,
__read_only image2d_t inputB,
__write_only image2d_t output,
int M,
int K,
int N,
int ac2zero,
int bc2zero,
float scale_a,
float zp_a,
float scale_b,
float zp_b,
float scale_out,
float zp_out
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);
float4 sum = (float4)(0);
for(; coord.z < K;)
{
float4 tempA0;
float4 tempB0;
tempA0 = read_imagef(inputA, coord.zy);
tempB0 = convert_float4(read_imagei(inputB, coord.zx));
coord.z++;
tempB0.x = (tempB0.x - zp_b) * scale_b;
sum = sum + tempA0 * tempB0;
}
write_imagef(output, coord.xy, sum);
}
__kernel void gemm_transb_F32I8toF32_3D(
__read_only image2d_array_t inputA,
__read_only image2d_array_t inputB,
__write_only image2d_array_t output,
int M,
int K,
int N,
int ac2zero,
int bc2zero,
float scale_a,
float zp_a,
float scale_b,
float zp_b,
float scale_out,
float zp_out
)
{
int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? 0 : get_global_id(2)), 0);
int4 coord_b = (int4)(0, get_global_id(0), (bc2zero ? 0 : get_global_id(2)), 0);
float4 sum = (float4)(0);
for(; coord_a.x < K;)
{
float4 tempA0;
float4 tempB0;
tempA0 = read_imagef(inputA, coord_a);
tempB0 = convert_float4(read_imagei(inputB, coord_b));
tempB0.x = (tempB0.x - zp_b) * scale_b;
coord_a.x++;
coord_b.x++;
sum = sum + tempA0 * tempB0;
}
coord_a.x = get_global_id(0);
coord_a.z = get_global_id(2);
write_imagef(output, coord_a, sum);
}

View File

@ -6,32 +6,30 @@ __kernel void gemm_transa_F32F32toF32_2D(
int K,
int N,
int ac2zero,
int bc2zero
int bc2zero,
float scale_a,
float zp_a,
float scale_b,
float zp_b,
float scale_out,
float zp_out
)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int2 coord_a = (int2)(gidy, 0);
int2 coord_b = (int2)(gidx, 0);
int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);
float4 sum = (float4)(0);
for(; coord_a.y < K;)
for(; coord.z < K;)
{
float4 tempA0;
float4 tempB0;
tempA0 = read_imagef(inputA, coord_a);
tempB0 = read_imagef(inputB, coord_b);
coord_a.y++;
coord_b.y++;
tempA0 = read_imagef(inputA, coord.yz);
tempB0 = read_imagef(inputB, coord.xz);
coord.z++;
sum += tempA0 * tempB0;
sum = sum + tempA0 * tempB0;
}
coord_b.y = gidy;
write_imagef(output, coord_b, sum);
write_imagef(output, coord.xy, sum);
}
__kernel void gemm_transa_F32F32toF32_3D(
@ -42,7 +40,13 @@ __kernel void gemm_transa_F32F32toF32_3D(
int K,
int N,
int ac2zero,
int bc2zero
int bc2zero,
float scale_a,
float zp_a,
float scale_b,
float zp_b,
float scale_out,
float zp_out
)
{
int gidx = get_global_id(0);
@ -63,7 +67,7 @@ __kernel void gemm_transa_F32F32toF32_3D(
coord_a.y++;
coord_b.y++;
sum += tempA0 * tempB0;
sum = sum + tempA0 * tempB0;
}
coord_b.y = gidy;

View File

@ -0,0 +1,108 @@
inline float roi_align_1x1
(
__read_only image2d_array_t input,
float2 region_start,
float2 region_end,
float2 bin_size,
int2 grid_size,
float2 rcp_of_grid_size,
int pz
)
{
float sum = 0;
for(int iy = 0; iy < grid_size.y; ++iy)
{
for(int ix = 0; ix < grid_size.x; ++ix)
{
float2 ixy = (float2)(ix + 0.5f, iy + 0.5f);
float2 pos = region_start + ixy * bin_size * rcp_of_grid_size;
int2 xy_low = convert_int2(pos);
int2 xy_high = xy_low + 1;
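// Bilinear interpolation: lx/ly are the fractional offsets inside the cell and
// w1..w4 the weights of the four neighbouring pixels.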
float ly = pos.y - xy_low.y;
float lx = pos.x - xy_low.x;
float hy = 1.0f - ly;
float hx = 1.0f - lx;
float w1 = hy * hx;
float w2 = hy * lx;
float w3 = ly * hx;
float w4 = ly * lx;
float data1 = read_imagef(input, (int4)(xy_low.x, xy_low.y, pz, 0)).x;
float data2 = read_imagef(input, (int4)(xy_high.x, xy_low.y, pz, 0)).x;
float data3 = read_imagef(input, (int4)(xy_low.x, xy_high.y, pz, 0)).x;
float data4 = read_imagef(input, (int4)(xy_high.x, xy_high.y, pz, 0)).x;
sum = sum + w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
}
}
return (float)(sum * rcp_of_grid_size.x * rcp_of_grid_size.y);
}
#define EPS_GRID 0.00001f
__kernel void roi_align_F32toF32
(
__read_only image2d_array_t input,
__read_only image2d_t rois,
__read_only image2d_t n_rois,
__write_only image2d_array_t output,
float spatial_x_scale,
float spatial_y_scale,
float in_width,
float in_height,
float rcp_of_out_width,
float rcp_of_out_height,
float sampling_x_ratio,
float sampling_y_ratio,
int depth
)
{
int px = get_global_id(0);
int py = get_global_id(1);
int pw = get_global_id(2);
int roi_batch = read_imagei(n_rois, (int2)(pw, 0)).x;
float4 roi_x = read_imagef(rois, (int2)(0, pw));
float4 roi_y = read_imagef(rois, (int2)(1, pw));
float4 roi_z = read_imagef(rois, (int2)(2, pw));
float4 roi_w = read_imagef(rois, (int2)(3, pw));
float4 roi = (float4)(roi_x.x, roi_y.x, roi_z.x, roi_w.x);
float4 roi_anchor = roi * (float4)(spatial_x_scale, spatial_y_scale, spatial_x_scale, spatial_y_scale);
float2 roi_dims = fmax(roi_anchor.zw - roi_anchor.xy, 1.0f);
float2 spatial_indx = (float2)(px, py);
float2 pooled_dims = (float2)(rcp_of_out_width, rcp_of_out_height);
float2 max_spatial_dims = (float2)(in_width, in_height);
float2 bin_size = roi_dims * pooled_dims;
float2 region_start = spatial_indx * bin_size + roi_anchor.xy;
float2 region_end = region_start + bin_size;
float2 roi_bin_grid = (float2)(sampling_x_ratio, sampling_y_ratio);
roi_bin_grid = roi_bin_grid == 0 ? ceil(bin_size - EPS_GRID) : roi_bin_grid;
int kz = roi_batch * depth;
float2 rcp_of_grid_size = 1.0f / roi_bin_grid;
int2 grid_size_xy = convert_int2(roi_bin_grid);
float4 interp;
int kz1 = pw * depth;
for (int pz = 0; pz < depth; pz ++, kz ++, kz1 ++)
{
interp.x = roi_align_1x1( input,
region_start,
region_end,
bin_size,
grid_size_xy,
rcp_of_grid_size,
kz);
write_imagef(output, (int4)(px, py, kz1, 0), interp);
}
}

View File

@ -0,0 +1,90 @@
__kernel void space2depth_internal_F32toF32 (
image2d_array_t input,
image2d_array_t output,
int block_size_x, int block_size_y,
float scaleInOut, float zpInOut)
{
int x = get_global_id(0);
int y = get_global_id(1);
int z = get_global_id(2);
int inDepth = get_image_array_size(input);
int4 coord = (int4)(x, y, z, 0);
float4 data = {0.0};
data = read_imagef(input, coord);
ushort blockSize_x = convert_ushort(block_size_x);
ushort blockSize_y = convert_ushort(block_size_y);
int4 coord_out = (int4)(convert_ushort(x)/blockSize_x, convert_ushort(y)/blockSize_y, 0, 0);
coord_out.z = ((x - coord_out.x * block_size_x) + (y - coord_out.y * block_size_y) * block_size_x) * inDepth
+ z;
write_imagef(output, coord_out, data);
}
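// Specialization for block_size_x == 2, block_size_y == 1: the division and modulo
// reduce to a right shift and a bit mask.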
__kernel void space2depth_internal_F32toF32_X2Y1 (
image2d_array_t input,
image2d_array_t output,
int block_size_x, int block_size_y,
float scaleInOut, float zpInOut)
{
int x = get_global_id(0);
int y = get_global_id(1);
int z = get_global_id(2);
int inDepth = get_image_array_size(input);
int4 coord = (int4)(x, y, z, 0);
float4 data = {0.0};
data = read_imagef(input, coord);
int4 coord_out = (int4)(x >> 1, y, 0, 0);
coord_out.z = (x & 1) * inDepth + z;
write_imagef(output, coord_out, data);
}
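// The U8 variants repeat the same index mapping and requantize each value with
// scaleInOut / zpInOut before writing.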
__kernel void space2depth_internal_U8toU8 (
image2d_array_t input,
image2d_array_t output,
int block_size_x, int block_size_y,
float scaleInOut, float zpInOut)
{
int x = get_global_id(0);
int y = get_global_id(1);
int z = get_global_id(2);
int inDepth = get_image_array_size(input);
int4 coord = (int4)(x, y, z, 0);
uint4 data = {0};
data = read_imageui(input, coord);
ushort blockSize_x = convert_ushort(block_size_x);
ushort blockSize_y = convert_ushort(block_size_y);
int4 coord_out = (int4)(convert_ushort(x)/blockSize_x, convert_ushort(y)/blockSize_y, 0, 0);
coord_out.z = ((x - coord_out.x * block_size_x) + (y - coord_out.y * block_size_y) * block_size_x) * inDepth
+ z;
data.x = convert_uint(data.x * scaleInOut + zpInOut);
write_imageui(output, coord_out, data);
}
__kernel void space2depth_internal_U8toU8_X2Y1 (
image2d_array_t input,
image2d_array_t output,
int block_size_x, int block_size_y,
float scaleInOut, float zpInOut)
{
int x = get_global_id(0);
int y = get_global_id(1);
int z = get_global_id(2);
int inDepth = get_image_array_size(input);
int4 coord = (int4)(x, y, z, 0);
uint4 data = {0};
data = read_imageui(input, coord);
int4 coord_out = (int4)(x >> 1, y, 0, 0);
coord_out.z = (x & 1) * inDepth + z;
data.x = convert_uint(data.x * scaleInOut + zpInOut);
write_imageui(output, coord_out, data);
}

View File

@ -1,253 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "vsi_nn_platform.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_math.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vx_lib_nnext.h"
void myTensorCropFunc
(
int8_t *src,
int8_t *dst
)
{
return;
}
vsi_status VX_CALLBACK TensorCropInternalKernel
(vx_node node,
const vx_reference* paramObj,
uint32_t paramNum
)
{
vsi_status status = VX_ERROR_INVALID_PARAMETERS;
if(paramNum == 2)
{
}
return status;
}
vsi_status VX_CALLBACK TensorCropInitializer
(vx_node nodObj,
const vx_reference *paramObj,
uint32_t paraNum
)
{
vsi_status status = VX_SUCCESS;
#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1)
vx_kernel_execution_parameters_t shaderParam = {
3, // workdim
{0, 0, 0}, // globalWorkOffset: controls the start location to be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in threads
{0, 0, 0}}; // globalWorkSize: image size in threads
vx_tensor input = (vx_tensor)paramObj[0];
vx_tensor output = (vx_tensor)paramObj[1];
uint32_t output_size[4] = {1, 1, 1, 1};
vsi_enum dataFormat, dstFormat;
int8_t input_fixPointPos = 0;
vx_uint32 i = 0;
int32_t offset[3];
size_t size[DIM_SIZE];
vsi_nn_tensor_attr_t attr[2];
memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t));
memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t));
status = vsi_nn_vxGetTensorAttr(input, &attr[0]);
status |= vsi_nn_vxGetTensorAttr(output, &attr[1]);
if (status != VX_SUCCESS)
{
VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
return status;
}
dataFormat = attr[0].dtype.vx_type;
input_fixPointPos = attr[0].dtype.fl;
dstFormat = attr[1].dtype.vx_type;
for (i = 0; i < attr[1].dim_num; i++)
{
output_size[i] = attr[1].size[i];
}
vxCopyScalar((vx_scalar)paramObj[2], &offset[0], VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
vxCopyScalar((vx_scalar)paramObj[3], &offset[1], VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
vxCopyScalar((vx_scalar)paramObj[4], &offset[2], VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
memset(size, 0, sizeof(size_t) * DIM_SIZE);
switch(dstFormat)
{
case VSI_NN_TYPE_INT8:
case VSI_NN_TYPE_UINT8:
size[0] = 16;
size[1] = 4;
break;
case VSI_NN_TYPE_INT16:
case VSI_NN_TYPE_UINT16:
case VSI_NN_TYPE_FLOAT16:
size[0] = 8;
size[1] = 4;
break;
}
shaderParam.globalWorkOffset[0] = offset[0];
shaderParam.globalWorkOffset[1] = offset[1];
shaderParam.globalWorkOffset[2] = offset[2];
shaderParam.globalWorkScale[0] = size[0];
shaderParam.globalWorkScale[1] = size[1];
shaderParam.globalWorkScale[2] = 1;
shaderParam.globalWorkSize[0] = gcmALIGN((output_size[0] + shaderParam.globalWorkScale[0] - 1)
/ shaderParam.globalWorkScale[0], 4);
shaderParam.globalWorkSize[1] = (output_size[1] + shaderParam.globalWorkScale[1] - 1)
/ shaderParam.globalWorkScale[1];
shaderParam.globalWorkSize[2] = output_size[2];
if(dataFormat == VSI_NN_TYPE_INT16 && dstFormat == VSI_NN_TYPE_FLOAT16)
{
vx_uint32 uniConvertInt16toFp16_2x8[16] = {
0x11111111, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
};
#define cropMIN(x, y) (((x) <= (y)) ? (x) : (y))
#define CROP_MAX_POST_SHIFT_BITS (31)
#define CROP_MAX_MULTIPLIER_NUM (65535)
if (input_fixPointPos > 0)
{
vx_uint8 postshift = cropMIN(input_fixPointPos, CROP_MAX_POST_SHIFT_BITS);
uniConvertInt16toFp16_2x8[7] |= (postshift & 0x1F);
}
else
{
vx_uint32 multiplier = cropMIN((int64_t)1 << (-input_fixPointPos), CROP_MAX_MULTIPLIER_NUM);
for (i = 0; i < 8; i++)
{
uniConvertInt16toFp16_2x8[i + 8] = multiplier;
}
}
#undef cropMIN
#undef CROP_MAX_POST_SHIFT_BITS
#undef CROP_MAX_MULTIPLIER_NUM
status |= vxSetNodeUniform(nodObj, "uniConvertInt16toFp16_2x8", 1, uniConvertInt16toFp16_2x8);
}
vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS,
&shaderParam, sizeof(vx_kernel_execution_parameters_t));
if(status < 0)
{
VSILOGE("[%s : %d]Initializer failure! \n",__FILE__, __LINE__);
}
return status;
}
vx_param_description_t basekernel_tensorCrop_params[] = {
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}
};
#ifdef __cplusplus
extern "C" {
#endif
vx_kernel_description_t vxTensorCropKernelInt16Info =
{
VX_KERNEL_ENUM_TENSORCROP_INT16,
VX_KERNEL_NAME_TENSORCROP_INT16,
NULL,
basekernel_tensorCrop_params,
(sizeof(basekernel_tensorCrop_params) / sizeof(basekernel_tensorCrop_params[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
TensorCropInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t vxTensorCropKernelInt8Info =
{
VX_KERNEL_ENUM_TENSORCROP_INT8,
VX_KERNEL_NAME_TENSORCROP_INT8,
NULL,
basekernel_tensorCrop_params,
(sizeof(basekernel_tensorCrop_params) / sizeof(basekernel_tensorCrop_params[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
TensorCropInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t vxTensorCropKernelInt16Fp16Info =
{
VX_KERNEL_ENUM_TENSORCROP_INT16_FP16,
VX_KERNEL_NAME_TENSORCROP_INT16_FP16,
NULL,
basekernel_tensorCrop_params,
(sizeof(basekernel_tensorCrop_params) / sizeof(basekernel_tensorCrop_params[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
TensorCropInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t * vx_kernel_CROP_list[] =
{
NULL,
&vxTensorCropKernelInt16Info,
&vxTensorCropKernelInt8Info,
&vxTensorCropKernelInt16Fp16Info,
NULL
};
#ifdef __cplusplus
}
#endif

View File

@ -1,323 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "vsi_nn_platform.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_math.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vx_lib_nnext.h"
#define _VX_KERNEL_VAR (vx_kernel_FCL2)
#define _VX_KERNEL_ID (VX_KERNEL_ENUM_FULLYCONNECTED_AXIS2)
#define _VX_KERNEL_NAME ("vsi_nn_kernel_fullconnect2")
#define _VX_KERNEL_FUNC_KERNEL (vxFullconnect2Kernel)
//static uint32_t layerNum = 0;
static vsi_status VX_CALLBACK vxFullconnect2Kernel
(
vx_node node,
const vx_reference* paramObj,
uint32_t paramNum
)
{
/* TODO: */
#define ARG_NUM (2)
#define TENSOR_NUM_INPUT (3)
#define TENSOR_NUM_OUTPUT (1)
#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT)
vsi_status status = VX_SUCCESS;
uint32_t i, j, k;
vx_context context = NULL;
vsi_nn_tensor_attr_t attr[TENSOR_NUM];
uint32_t stride_size[TENSOR_NUM][VSI_NN_MAX_DIM_NUM];
vx_tensor_addressing user_addr[TENSOR_NUM] = {NULL};
uint8_t *buffer_ptr[TENSOR_NUM] = {NULL};
vx_tensor tensor[TENSOR_NUM];
//char fileName[256] = {'\0'};
//uint32_t total_size;
int32_t axis, weights;
uint32_t num_fc = 1, num_no_fc = 1;
//prepare data
context = vxGetContext((vx_reference)node);
for( i = 0; i < TENSOR_NUM_INPUT; i ++ )
{
tensor[i] = (vx_tensor)paramObj[i];
buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i],
&(attr[i]), stride_size[i], &(user_addr[i]), VX_READ_ONLY);
}
for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ )
{
tensor[i] = (vx_tensor)paramObj[i];
buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i],
&(attr[i]), stride_size[i], &(user_addr[i]), VX_WRITE_ONLY);
}
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(axis),
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 1], &(weights),
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
//op calc
for(i = 0; i <= (uint32_t)axis; ++i)
{
num_fc *= attr[0].size[i];
}
for(i = axis + 1; i < attr[0].dim_num; ++i)
{
num_no_fc *= attr[0].size[i];
}
for(k = 0; k < num_no_fc; ++k)
{
for(j = 0; j < (uint32_t)weights; ++j)
{
float sum;
vsi_nn_DtypeToFloat32(&buffer_ptr[2][stride_size[2][0] * j], &sum, &attr[2].dtype);
for(i = 0; i < num_fc; ++i)
{
float x, w;
vsi_nn_DtypeToFloat32(&buffer_ptr[0][stride_size[0][0] * (i + num_fc * k)],
&x, &attr[0].dtype);
vsi_nn_DtypeToFloat32(&buffer_ptr[1][stride_size[1][0] * (i + num_fc * j)],
&w, &attr[1].dtype);
sum += w * x;
}
vsi_nn_Float32ToDtype(sum, &buffer_ptr[3][stride_size[3][0] * (j + weights * k)],
&attr[3].dtype);
}
}
#if 0
print_index = 3;
total_size = vsi_nn_ShapeProduct(size[print_index], dim_num[print_index]);
if (dim_num[print_index] == 3)
{
snprintf(fileName, VSI_NN_MAX_PATH, "%s_%d_%d_%d_%d.txt", _VX_KERNEL_NAME, layerNum,
size[print_index][0], size[print_index][1], size[print_index][2]);
}
else
{
snprintf(fileName, VSI_NN_MAX_PATH, "%s_%d_%d_%d_%d_%d.txt", _VX_KERNEL_NAME, layerNum,
size[print_index][0], size[print_index][1], size[print_index][2], size[print_index][3]);
}
vsi_nn_SaveDataToText(fileName, buffer_ptr[print_index], total_size,
data_format[print_index], NULL);
layerNum++;
#endif
//save data
for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ )
{
status = vsi_nn_copy_tensor_patch(tensor[i], &attr[i], buffer_ptr[i], VX_WRITE_ONLY);
if (user_addr[i]) vxReleaseTensorAddressing(&(user_addr[i]));
}
for( i = 0; i < TENSOR_NUM; i ++ )
{
if (buffer_ptr[i]) free(buffer_ptr[i]);
}
return status;
} /* _VX_KERNEL_FUNC_KERNEL() */
static vx_param_description_t s_params[] =
{
{ VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
{ VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
{ VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
{ VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
{ VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED },
{ VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED },
};
void myFullyConnected_Axis2Func
(
int8_t *src,
int8_t *dst
)
{
return;
}
vsi_status VX_CALLBACK vxFullyConnected_Axis2Kernel
(
vx_node node,
const vx_reference* paramObj,
uint32_t paramNum
)
{
vsi_status status = VX_ERROR_INVALID_PARAMETERS;
if(paramNum == 2)
{
}
return status;
}
vsi_status VX_CALLBACK vxFullyConnected_Axis2Initializer
(
vx_node nodObj,
const vx_reference *paramObj,
uint32_t paraNum
)
{
vsi_status status = VX_SUCCESS;
// Alignment with a power of two value.
#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1)
vx_kernel_execution_parameters_t shaderParam = {
2, // workdim
{0, 0, 0}, // globalWorkOffset: controls the start location to be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in threads
{0, 0, 0}}; // globalWorkSize: image size in threads
uint32_t input_size[DIM_SIZE] = {1, 1, 1, 1};
uint32_t output_size[DIM_SIZE] = {1, 1, 1, 1};
uint32_t uniMulAcc_16x1[16] = {
0x00005555, // TCfg
0x00000000, // ASelt
0x76543210, 0x00000000, // ABin
0x00005555, // BSelt
0x76543210, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
};
uint32_t loopNum = 0;
vsi_nn_tensor_attr_t attr[2];
uint32_t i;
uint32_t input_dims = 0;
uint32_t output_dims = 0;
memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t));
memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t));
status = vsi_nn_vxGetTensorAttr((vx_tensor)paramObj[1], &attr[0]);
status |= vsi_nn_vxGetTensorAttr((vx_tensor)paramObj[3], &attr[1]);
if (status != VX_SUCCESS)
{
VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
return status;
}
input_dims = attr[0].dim_num;
for (i = 0; i < input_dims; i++)
{
input_size[i] = attr[0].size[i];
}
output_dims = attr[1].dim_num;
for (i = 0; i < output_dims; i++)
{
output_size[i] = attr[1].size[i];
}
shaderParam.globalWorkOffset[0] = 0;
shaderParam.globalWorkOffset[1] = 0;
shaderParam.globalWorkScale[0] = 1;
shaderParam.globalWorkScale[1] = 1;
shaderParam.globalWorkSize[0] = gcmALIGN((output_size[0] + shaderParam.globalWorkScale[0] - 1)
/ shaderParam.globalWorkScale[0], 4);
shaderParam.globalWorkSize[1] = (output_size[1] + shaderParam.globalWorkScale[1] - 1)
/ shaderParam.globalWorkScale[1];
vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS,
&shaderParam, sizeof(vx_kernel_execution_parameters_t));
vxSetNodeUniform(nodObj, "uniMulAcc_16x1", 1, uniMulAcc_16x1);
loopNum = gcmALIGN(input_size[0], 32);
vxSetNodeUniform(nodObj, "loopNum", 1, &loopNum);
if(status < 0)
{
VSILOGE("[%s : %d]Initializer failure! \n",__FILE__, __LINE__);
}
return status;
}
static vx_param_description_t vxFullyConnected_Axis2KernelParam[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}
};
#ifdef __cplusplus
extern "C" {
#endif
vx_kernel_description_t _VX_KERNEL_VAR =
{
_VX_KERNEL_ID,
_VX_KERNEL_NAME,
_VX_KERNEL_FUNC_KERNEL,
s_params,
_cnt_of_array( s_params ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t vxFullyConnected_Axis2KernelInfo =
{
_VX_KERNEL_ID,
_VX_KERNEL_NAME,
vxFullyConnected_Axis2Kernel,
vxFullyConnected_Axis2KernelParam,
(sizeof(vxFullyConnected_Axis2KernelParam) / sizeof(vxFullyConnected_Axis2KernelParam[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
vxFullyConnected_Axis2Initializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t * vx_kernel_FCL2_list[] =
{
&_VX_KERNEL_VAR,
&vxFullyConnected_Axis2KernelInfo,
NULL
};
#ifdef __cplusplus
}
#endif

View File

@ -1,688 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <math.h>
#include "vsi_nn_platform.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_math.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vx_lib_nnext.h"
void myLayerNormFunc
(
void* src,
int16_t* scale,
float* bias,
float eps,
void* dst,
uint32_t input_dim,
uint32_t width,
uint32_t height,
uint32_t channel,
uint32_t batch
)
{
uint32_t ch = (input_dim <= 2) ? 1 : channel;
uint32_t bn = (input_dim <= 3) ? 1 : batch;
uint32_t b = 0, c = 0, h = 0, w = 0;
int16_t* imgIn, *imgOut;
imgIn = (int16_t*)src;
imgOut = (int16_t*)dst;
VSILOGI("Hello myLayerNormFunc!\n");
for (b = 0; b < bn; b++)
{
for (c = 0; c < ch; c++)
{
for (h = 0; h < height; h++)
{
uint32_t len = (h + (c + b*ch)*height) * width;
float sum = .0f;
float sumsq = .0f;
float mean = .0f;
float vari = .0f;
for (w = 0; w < width; w++)
{
uint32_t index = len + w;
sum += vsi_nn_Fp16toFp32(imgIn[index]);
}
mean = sum / width;
for (w = 0; w < width; w++)
{
uint32_t index = len + w;
float data = vsi_nn_Fp16toFp32(imgIn[index]) - mean;
sumsq += data * data;
}
vari = sumsq / width;
vari = (float)(1.0 / sqrtf(vari + eps));
for (w = 0; w < width; w++)
{
uint32_t index = len + w;
float data = vsi_nn_Fp16toFp32(imgIn[index]) - mean;
float scaleVal = vsi_nn_Fp16toFp32(scale[w]);
float biasVal = bias[w];
float normVal = data * vari * scaleVal + biasVal;
imgOut[index] = vsi_nn_Fp32ToFp16(normVal);
}
}
}
}
return;
}
void myLayerNormFunc_u8
(
void* src,
int16_t* scale,
float* bias,
float eps,
void* dst,
uint32_t input_dim,
uint32_t width,
uint32_t height,
uint32_t channel,
uint32_t batch,
int32_t inZp,
int32_t outZp,
float inScale,
float outScale
)
{
uint32_t ch = (input_dim <= 2) ? 1 : channel;
uint32_t bn = (input_dim <= 3) ? 1 : batch;
uint32_t b = 0, c = 0, h = 0, w = 0;
uint8_t* imgIn, *imgOut;
imgIn = (uint8_t*)src;
imgOut = (uint8_t*)dst;
VSILOGI("Hello myLayerNormFunc!\n");
for (b = 0; b < bn; b++)
{
for (c = 0; c < ch; c++)
{
for (h = 0; h < height; h++)
{
uint32_t len = (h + (c + b*ch)*height) * width;
float sum = .0f;
float sumsq = .0f;
float mean = .0f;
float vari = .0f;
for (w = 0; w < width; w++)
{
uint32_t index = len + w;
//sum += vsi_nn_Fp16toFp32(imgIn[index]);
sum += vsi_nn_AffineToFp32(imgIn[index], inScale, inZp, VSI_NN_TYPE_UINT8);
}
mean = sum / width;
for (w = 0; w < width; w++)
{
uint32_t index = len + w;
//float data = vsi_nn_Fp16toFp32(imgIn[index]) - mean;
float data = vsi_nn_AffineToFp32(imgIn[index], inScale, inZp, VSI_NN_TYPE_UINT8) - mean;
sumsq += data * data;
}
vari = sumsq / width;
vari = (float)(1.0 / sqrtf(vari + eps));
for (w = 0; w < width; w++)
{
uint32_t index = len + w;
//float data = vsi_nn_Fp16toFp32(imgIn[index]) - mean;
float data = vsi_nn_AffineToFp32(imgIn[index], inScale, inZp, VSI_NN_TYPE_UINT8) - mean;
float scaleVal = vsi_nn_Fp16toFp32(scale[w]);
float biasVal = bias[w];
float normVal = data * vari * scaleVal + biasVal;
//imgOut[index] = vsi_nn_Fp32ToFp16(normVal);
imgOut[index] = (vx_uint8)vsi_nn_Fp32ToAffine(normVal, outScale, outZp, VSI_NN_TYPE_UINT8);
}
}
}
}
return;
}
vsi_status VX_CALLBACK vxLayerNormKernel
(
vx_node node,
const vx_reference* paramObj,
uint32_t paramNum
)
{
vsi_status status = VX_ERROR_INVALID_PARAMETERS;
if(paramNum == 5)
{
vx_context context = NULL;
// tensor
vx_tensor imgObj[4] = { NULL };
vsi_nn_tensor_attr_t attr[4];
int16_t *input = NULL, *output = NULL, *scale = NULL;
float *bias = NULL;
uint32_t input_size[4] = {1, 1, 1, 1}, output_size[4] = {1, 1, 1, 1};
uint32_t scale_size[4] = {1, 1, 1, 1}, bias_size[4] = {1, 1, 1, 1};
uint32_t input_stride_size[4] = {0};
uint32_t output_stride_size[4] = {0};
uint32_t scale_stride_size[4] = {0};
uint32_t bias_stride_size[4] = {0};
vx_tensor_addressing input_user_addr = NULL;
vx_tensor_addressing output_user_addr = NULL;
vx_tensor_addressing scale_user_addr = NULL;
vx_tensor_addressing bias_user_addr = NULL;
vsi_nn_type_e inputFormat = VSI_NN_TYPE_FLOAT16, outputFormat = VSI_NN_TYPE_FLOAT16;
vsi_nn_type_e scaleFormat = VSI_NN_TYPE_FLOAT16, biasFormat = VSI_NN_TYPE_FLOAT16;
uint32_t input_dims = 0, output_dims = 0;
uint32_t scale_dims = 0, bias_dims = 0;
uint32_t i;
int32_t in_zp, out_zp;
float in_scale, out_scale;
// scalar
vx_scalar scalar[1] = { NULL };
float eps = .0f;
imgObj[0] = (vx_tensor)paramObj[0];
imgObj[1] = (vx_tensor)paramObj[1];
imgObj[2] = (vx_tensor)paramObj[2];
imgObj[3] = (vx_tensor)paramObj[3];
scalar[0] = (vx_scalar)paramObj[4];
memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t));
memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t));
memset(&attr[2], 0, sizeof(vsi_nn_tensor_attr_t));
memset(&attr[3], 0, sizeof(vsi_nn_tensor_attr_t));
context = vxGetContext((vx_reference)node);
if (context == NULL)
{
VSILOGE("vxGetContext failure! at line %d\n", __LINE__);
goto OnError;
}
status = vsi_nn_vxGetTensorAttr(imgObj[0], &attr[0]);
status |= vsi_nn_vxGetTensorAttr(imgObj[1], &attr[1]);
status |= vsi_nn_vxGetTensorAttr(imgObj[2], &attr[2]);
status |= vsi_nn_vxGetTensorAttr(imgObj[3], &attr[3]);
if (status != VX_SUCCESS)
{
VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
goto OnError;
}
input_dims = attr[0].dim_num;
inputFormat = attr[0].dtype.vx_type;
for (i = 0; i < input_dims; i++)
{
input_size[i] = attr[0].size[i];
}
in_zp = attr[0].dtype.zero_point;
in_scale = attr[0].dtype.scale;
//bias
bias_dims = attr[1].dim_num;
biasFormat = attr[1].dtype.vx_type;
for (i = 0; i < bias_dims; i++)
{
bias_size[i] = attr[1].size[i];
}
//scale
scale_dims = attr[2].dim_num;
scaleFormat = attr[2].dtype.vx_type;
for (i = 0; i < scale_dims; i++)
{
scale_size[i] = attr[2].size[i];
}
//output
output_dims = attr[3].dim_num;
outputFormat = attr[3].dtype.vx_type;
for (i = 0; i < output_dims; i++)
{
output_size[i] = attr[3].size[i];
}
out_zp = attr[3].dtype.zero_point;
out_scale = attr[3].dtype.scale;
input_size[2] = (input_dims <= 2)?1:input_size[2];
input_size[3] = (input_dims <= 3)?1:input_size[3];
input_stride_size[0] = vsi_nn_GetTypeBytes(inputFormat);
output_stride_size[0] = vsi_nn_GetTypeBytes(outputFormat);
for (i=1; i< input_dims; i++)
{
input_stride_size[i] = input_stride_size[i-1] * input_size[i-1];
output_stride_size[i] = output_stride_size[i-1] * output_size[i-1];
}
input = (int16_t*)malloc(input_size[0]*input_size[1]*input_size[2]*sizeof(int16_t));
output = (int16_t*)malloc(output_size[0]*output_size[1]*output_size[2]*sizeof(int16_t));
input_user_addr = vxCreateTensorAddressing(context, input_size, input_stride_size, (vx_uint8)input_dims);
vsi_nn_copy_tensor_patch(imgObj[0], &attr[0], input, VX_READ_ONLY);
//scale and bias
scale_stride_size[0] = vsi_nn_GetTypeBytes(scaleFormat);
bias_stride_size[0] = vsi_nn_GetTypeBytes(biasFormat);
for (i=1; i< scale_dims; i++)
{
scale_stride_size[i] = scale_stride_size[i-1] * scale_size[i-1];
bias_stride_size[i] = bias_stride_size[i-1] * bias_size[i-1];
}
scale = (int16_t*)malloc(scale_size[0]*sizeof(int16_t));
bias = (float*)malloc(bias_size[0]*sizeof(float));
bias_user_addr = vxCreateTensorAddressing(context, bias_size, bias_stride_size, (vx_uint8)bias_dims);
vsi_nn_copy_tensor_patch(imgObj[1], &attr[1], bias, VX_READ_ONLY);
scale_user_addr = vxCreateTensorAddressing(context, scale_size, scale_stride_size, (vx_uint8)scale_dims);
vsi_nn_copy_tensor_patch(imgObj[2], &attr[2], scale, VX_READ_ONLY);
// scalar
status = vxCopyScalar(scalar[0], &eps, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
if (status != VX_SUCCESS)
{
VSILOGE("vxCopyScalar failure! at line %d\n", __LINE__);
goto OnError;
}
// Call C Prototype
if(inputFormat == VSI_NN_TYPE_FLOAT16)
{
myLayerNormFunc(input, scale, bias, eps, output, input_dims, input_size[0],
input_size[1], input_size[2], input_size[3]);
}
else
{
myLayerNormFunc_u8(input, scale, bias, eps, output, input_dims, input_size[0],
input_size[1], input_size[2], input_size[3], in_zp, out_zp, in_scale, out_scale);
}
//output tensor
output_user_addr = vxCreateTensorAddressing(context, output_size,
output_stride_size, (vx_uint8)output_dims);
vsi_nn_copy_tensor_patch(imgObj[3], &attr[3], output, VX_WRITE_ONLY);
OnError:
if(input) free(input);
if(scale) free(scale);
if(bias) free(bias);
if(output) free(output);
if(input_user_addr) vxReleaseTensorAddressing(&input_user_addr);
if(scale_user_addr) vxReleaseTensorAddressing(&scale_user_addr);
if(bias_user_addr) vxReleaseTensorAddressing(&bias_user_addr);
if(output_user_addr) vxReleaseTensorAddressing(&output_user_addr);
}
return status;
}
vsi_status VX_CALLBACK vxLayerNormInitializer
(
vx_node nodObj,
const vx_reference *paramObj,
uint32_t paraNum
)
{
vsi_status status = VX_SUCCESS;
// Alignment with a power of two value.
#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1)
vx_kernel_execution_parameters_t shaderParam = {
3, // workdim
{0, 0, 0}, // globalWorkOffset: controls the start location to be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in threads
{0, 0, 0}}; // globalWorkSize: image size in threads
vx_tensor input = (vx_tensor)paramObj[0];
vx_tensor scale = (vx_tensor)paramObj[2];
vx_tensor output = (vx_tensor)paramObj[3];
uint32_t input_size[4] = {1, 1, 1, 1};
uint32_t input_dims = 0;
vsi_nn_type_e inputDataFormat = VSI_NN_TYPE_FLOAT16;
vsi_nn_type_e scaleDataFormat = VSI_NN_TYPE_FLOAT16;
vsi_nn_type_e outputDataFormat = VSI_NN_TYPE_FLOAT16;
vx_float32 scaleIn = 0;
vx_float32 scaleOut = 0;
vx_float32 reScaleOut_u8 = 0;
vx_float32 reOutZP = 0.f;
int32_t output_ZP = 0;
int32_t input_ZP = 0;
vx_uint32 iter = 0;
int32_t sumInZp = 0;
int32_t tmpZp1 = 0;
int32_t tmpZp2 = 0;
vx_float32 e2InScale = 0;
vsi_nn_tensor_attr_t attr[3];
uint32_t i;
memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t));
memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t));
memset(&attr[2], 0, sizeof(vsi_nn_tensor_attr_t));
status = vsi_nn_vxGetTensorAttr(input, &attr[0]);
status |= vsi_nn_vxGetTensorAttr(output, &attr[1]);
status |= vsi_nn_vxGetTensorAttr(scale, &attr[2]);
if (status != VX_SUCCESS)
{
VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
return status;
}
input_dims = attr[0].dim_num;
inputDataFormat = attr[0].dtype.vx_type;
for (i = 0; i < input_dims; i++)
{
input_size[i] = attr[0].size[i];
}
input_ZP = attr[0].dtype.zero_point;
scaleIn = attr[0].dtype.scale;
outputDataFormat = attr[1].dtype.vx_type;
output_ZP = attr[1].dtype.zero_point;
scaleOut = attr[1].dtype.scale;
scaleDataFormat = attr[2].dtype.vx_type;
if(outputDataFormat == VSI_NN_TYPE_UINT8)
{
reScaleOut_u8 = 1.0f / scaleOut;
reOutZP = (vx_float32)output_ZP;
}
iter = ((input_size[0] + 15) / 16) * 16;
sumInZp = input_ZP * iter * (-1);
tmpZp1 = (-2) * input_ZP;
tmpZp2 = iter * input_ZP * input_ZP;
e2InScale = scaleIn * scaleIn;
input_size[2] = (input_dims <= 2)?1:input_size[2];
shaderParam.globalWorkOffset[0] = 0;
shaderParam.globalWorkOffset[1] = 0;
shaderParam.globalWorkOffset[2] = 0;
shaderParam.globalWorkScale[0] = input_size[0];
shaderParam.globalWorkScale[1] = 1;
shaderParam.globalWorkScale[2] = 1;
shaderParam.globalWorkSize[0] = 1;
shaderParam.globalWorkSize[1] = gcmALIGN((input_size[1] + shaderParam.globalWorkScale[1] - 1)
/ shaderParam.globalWorkScale[1], 4);
shaderParam.globalWorkSize[2] = input_size[2];
status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS,
&shaderParam, sizeof(vx_kernel_execution_parameters_t));
if(status < 0)
{
VSILOGE("[%s : %d]Initializer failure! \n",__FILE__, __LINE__);
}
{
vx_float32 dimRatio = 1.0f / (vx_float32)input_size[0];
vx_uint32 uniFp16SumSqr_dp8x2[16] = {
0x55555555, // TCfg
0x00000000, // ASelt
0x76543210, 0x76543210, // ABin
0x5555aaaa, // BSelt
0x00000000, 0x76543210, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
};
vx_uint32 UniFP16toFP32Lo4_dp4x4[16] = {
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
};
vx_uint32 uniExtractHalf4_dp4x4[16] = {
0x01010101, // TCfg
0x00000000, // ASelt
0x00020000, 0x00060004, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
};
vx_uint32 uniConvertSecFp16Fp32_4x4[16] = {
0x01010101, // TCfg
0x00000000, // ASelt
0x00050004, 0x00070006, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
};
vx_uint32 uniSumU8_16x1[16] = {
0x55555555, // TCfg
0x00000000, // ASelt
0x76543210, 0xfedcba98, // ABin
0xaaaaaaaa, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
};
vx_uint32 uniSqrSum_16x1[16] = {
0x55555555, // TCfg
0x00000000, // ASelt
0x76543210, 0xfedcba98, // ABin
0x55555555, // BSelt
0x76543210, 0xfedcba98, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
};
vx_uint32 uniConvert1stUint8SubZpToFp32_4x4[16] = {
0x05050505, // TCfg
0x04040404, // ASelt
0x00010000, 0x00030002, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant
};
vx_uint32 uniConvert2ndUint8SubZpToFp32_4x4[16] = {
0x05050505, // TCfg
0x04040404, // ASelt
0x00050004, 0x00070006, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant
};
vx_uint32 uniConvert3rdUint8SubZpToFp32_4x4[16] = {
0x05050505, // TCfg
0x04040404, // ASelt
0x00090008, 0x000b000a, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant
};
vx_uint32 uniConvert4thUint8SubZpToFp32_4x4[16] = {
0x05050505, // TCfg
0x04040404, // ASelt
0x000d000c, 0x000f000e, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant
};
vx_uint32 uniConvertInt32toUint8_2x8[16] = {
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
};
vx_uint32 UniPackFP16even_2x8[16] = {
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
};
if (inputDataFormat == VSI_NN_TYPE_UINT8 && outputDataFormat == VSI_NN_TYPE_FLOAT16
&& scaleDataFormat == VSI_NN_TYPE_FLOAT16)
{
status = vxSetNodeUniform(nodObj, "width", 1, &input_size[0]);
status |= vxSetNodeUniform(nodObj, "dimRatio", 1, &dimRatio);
status |= vxSetNodeUniform(nodObj, "UniFP16toFP32Lo4_dp4x4", 1, UniFP16toFP32Lo4_dp4x4);
status |= vxSetNodeUniform(nodObj, "uniConvertSecFp16Fp32_4x4", 1, uniConvertSecFp16Fp32_4x4);
status |= vxSetNodeUniform(nodObj, "uniSumU8_16x1", 1, uniSumU8_16x1);
status |= vxSetNodeUniform(nodObj, "uniSqrSum_16x1", 1, uniSqrSum_16x1);
status |= vxSetNodeUniform(nodObj, "uniConvert1stUint8SubZpToFp32_4x4", 1, uniConvert1stUint8SubZpToFp32_4x4);
status |= vxSetNodeUniform(nodObj, "uniConvert2ndUint8SubZpToFp32_4x4", 1, uniConvert2ndUint8SubZpToFp32_4x4);
status |= vxSetNodeUniform(nodObj, "uniConvert3rdUint8SubZpToFp32_4x4", 1, uniConvert3rdUint8SubZpToFp32_4x4);
status |= vxSetNodeUniform(nodObj, "uniConvert4thUint8SubZpToFp32_4x4", 1, uniConvert4thUint8SubZpToFp32_4x4);
status |= vxSetNodeUniform(nodObj, "inputZP", 1, &input_ZP);
status |= vxSetNodeUniform(nodObj, "input_scale", 1, &scaleIn);
status |= vxSetNodeUniform(nodObj, "sumInZp", 1, &sumInZp);
status |= vxSetNodeUniform(nodObj, "tmpZp1", 1, &tmpZp1);
status |= vxSetNodeUniform(nodObj, "tmpZp2", 1, &tmpZp2);
status |= vxSetNodeUniform(nodObj, "e2InScale", 1, &e2InScale);
status |= vxSetNodeUniform(nodObj, "UniPackFP16even_2x8", 1, UniPackFP16even_2x8);
}
else
{
status = vxSetNodeUniform(nodObj, "uniFp16SumSqr_dp8x2", 1, uniFp16SumSqr_dp8x2);
status |= vxSetNodeUniform(nodObj, "width", 1, &input_size[0]);
status |= vxSetNodeUniform(nodObj, "dimRatio", 1, &dimRatio);
status |= vxSetNodeUniform(nodObj, "UniFP16toFP32Lo4_dp4x4", 1, UniFP16toFP32Lo4_dp4x4);
status |= vxSetNodeUniform(nodObj, "uniExtractHalf4_dp4x4", 1, uniExtractHalf4_dp4x4);
status |= vxSetNodeUniform(nodObj, "uniConvertInt32toUint8_2x8", 1, uniConvertInt32toUint8_2x8);
status |= vxSetNodeUniform(nodObj, "uniConvertSecFp16Fp32_4x4", 1, uniConvertSecFp16Fp32_4x4);
status |= vxSetNodeUniform(nodObj, "uniSumU8_16x1", 1, uniSumU8_16x1);
status |= vxSetNodeUniform(nodObj, "uniSqrSum_16x1", 1, uniSqrSum_16x1);
status |= vxSetNodeUniform(nodObj, "uniConvert1stUint8SubZpToFp32_4x4", 1, uniConvert1stUint8SubZpToFp32_4x4);
status |= vxSetNodeUniform(nodObj, "uniConvert2ndUint8SubZpToFp32_4x4", 1, uniConvert2ndUint8SubZpToFp32_4x4);
status |= vxSetNodeUniform(nodObj, "uniConvert3rdUint8SubZpToFp32_4x4", 1, uniConvert3rdUint8SubZpToFp32_4x4);
status |= vxSetNodeUniform(nodObj, "uniConvert4thUint8SubZpToFp32_4x4", 1, uniConvert4thUint8SubZpToFp32_4x4);
status |= vxSetNodeUniform(nodObj, "inputZP", 1, &input_ZP);
status |= vxSetNodeUniform(nodObj, "output_ZP", 1, &output_ZP);
status |= vxSetNodeUniform(nodObj, "input_scale", 1, &scaleIn);
status |= vxSetNodeUniform(nodObj, "outputScale", 1, &reScaleOut_u8);
status |= vxSetNodeUniform(nodObj, "outputZP", 1, &reOutZP);
status |= vxSetNodeUniform(nodObj, "sumInZp", 1, &sumInZp);
status |= vxSetNodeUniform(nodObj, "tmpZp1", 1, &tmpZp1);
status |= vxSetNodeUniform(nodObj, "tmpZp2", 1, &tmpZp2);
status |= vxSetNodeUniform(nodObj, "e2InScale", 1, &e2InScale);
}
if(status < 0)
{
VSILOGE("[%s : %d]Initializer failure! \n",__FILE__, __LINE__);
}
}
return status;
}
static vx_param_description_t vxLayerNormKernelParam[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}
};
#ifdef __cplusplus
extern "C" {
#endif
vx_kernel_description_t vxLayerNormKernelInfo =
{
VX_KERNEL_ENUM_LAYERNORM,
VX_KERNEL_NAME_LAYERNORM,
NULL,
vxLayerNormKernelParam,
(sizeof(vxLayerNormKernelParam) / sizeof(vxLayerNormKernelParam[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
vxLayerNormInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t vxLayerNormKernelInfo_u8 =
{
VX_KERNEL_ENUM_LAYERNORM,
VX_KERNEL_NAME_LAYERNORM_UINT8,
NULL,
vxLayerNormKernelParam,
(sizeof(vxLayerNormKernelParam) / sizeof(vxLayerNormKernelParam[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
vxLayerNormInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t vxLayerNormKernelInfo_FP16toU8 =
{
VX_KERNEL_ENUM_LAYERNORM_FP16TOU8,
VX_KERNEL_NAME_LAYERNORM_FP16TOU8,
NULL,
vxLayerNormKernelParam,
(sizeof(vxLayerNormKernelParam) / sizeof(vxLayerNormKernelParam[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
vxLayerNormInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t vxLayerNormKernelInfo_U8toFP16 =
{
VX_KERNEL_ENUM_LAYERNORM,
VX_KERNEL_NAME_LAYERNORM_U8TOFP16,
NULL,
vxLayerNormKernelParam,
(sizeof(vxLayerNormKernelParam) / sizeof(vxLayerNormKernelParam[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
vxLayerNormInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t vxLayerNormKernelInfo_CPU =
{
VX_KERNEL_ENUM_LAYERNORM,
VX_KERNEL_NAME_LAYERNORM,
vxLayerNormKernel,
vxLayerNormKernelParam,
(sizeof(vxLayerNormKernelParam) / sizeof(vxLayerNormKernelParam[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t * vx_kernel_LAYERNORM_list[] =
{
&vxLayerNormKernelInfo_CPU,
&vxLayerNormKernelInfo,
&vxLayerNormKernelInfo_u8,
&vxLayerNormKernelInfo_FP16toU8,
&vxLayerNormKernelInfo_U8toFP16,
NULL
};
#ifdef __cplusplus
}
#endif

View File

@ -1,190 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_platform.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_math.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vx_lib_nnext.h"
#define _VX_KERNEL_VAR (vx_kernel_REDUCE)
#define _VX_KERNEL_ID (VX_KERNEL_ENUM_REDUCE)
#define _VX_KERNEL_NAME ("vsi_nn_kernel_reduce")
#define _VX_KERNEL_FUNC_KERNEL (vxReduceKernel)
static vx_status VX_CALLBACK vxReduceKernel
(
vx_node node,
const vx_reference* paramObj,
vx_uint32 paramNum
)
{
/* TODO: */
#define ARG_NUM (6)
#define TENSOR_NUM_INPUT (1)
#define TENSOR_NUM_OUTPUT (1)
#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT)
vx_status status = VX_SUCCESS;
vx_context context = NULL;
vsi_nn_tensor_attr_t attr[TENSOR_NUM];
vx_uint32 stride_size[TENSOR_NUM][VSI_NN_MAX_DIM_NUM];
vx_tensor_addressing user_addr[TENSOR_NUM] = {NULL};
vx_uint8 *buffer_ptr[TENSOR_NUM] = {NULL};
vx_tensor tensor[TENSOR_NUM];
vx_float32 factor0;
vx_int32 factor;
vx_uint32 batch, c, h, w;
vx_uint32 i, j, k, b;
//prepare data
context = vxGetContext((vx_reference)node);
for( i = 0; i < TENSOR_NUM_INPUT; i ++ )
{
tensor[i] = (vx_tensor)paramObj[i];
buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i],
&(attr[i]), stride_size[i], &(user_addr[i]), VX_READ_ONLY);
}
for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ )
{
tensor[i] = (vx_tensor)paramObj[i];
buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i],
&(attr[i]), stride_size[i], &(user_addr[i]), VX_WRITE_ONLY);
}
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(factor0), VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
//op calc
if (factor0 > 1)
{
factor = (vx_int32)(factor0 + 0.5);
w = attr[0].size[0];
h = attr[0].size[1];
c = attr[0].size[2];
batch = 1;
for(b = 0; b < batch; ++b){
for(k = 0; k < c; ++k){
for(j = 0; j < h*factor; ++j){
for(i = 0; i < w*factor; ++i){
vx_int32 in_index = b*w*h*c + k*w*h + (j/factor)*w + i/factor;
vx_int32 out_index = b*w*h*c*factor*factor + k*w*h*factor*factor +
j*w*factor + i;
vx_float32 fval;
//out[out_index] = in[in_index];
vsi_nn_DtypeToFloat32(&buffer_ptr[0][stride_size[0][0] * in_index],
&fval, &attr[0].dtype);
vsi_nn_Float32ToDtype(fval, &buffer_ptr[1][stride_size[1][0] * out_index],
&attr[1].dtype);
}
}
}
}
}
else
{
factor = (vx_int32)(1 / factor0 + 0.5);
w = attr[1].size[0];
h = attr[1].size[1];
c = attr[1].size[2];
batch = 1;
for(b = 0; b < batch; ++b){
for(k = 0; k < c; ++k){
for(j = 0; j < h; ++j){
for(i = 0; i < w; ++i){
vx_int32 in_index = b*w*h*c*factor*factor +
k*w*h*factor*factor + j*w*factor*factor + i*factor;
vx_int32 out_index = b*w*h*c + k*w*h + j * w + i;
vx_float32 fval;
//out[out_index] = in[in_index];
vsi_nn_DtypeToFloat32(&buffer_ptr[0][stride_size[0][0] * in_index], &fval,
&attr[0].dtype);
vsi_nn_Float32ToDtype(fval, &buffer_ptr[1][stride_size[1][0] * out_index],
&attr[1].dtype);
}
}
}
}
}
//save data
for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ )
{
status = vsi_nn_copy_tensor_patch(tensor[i], &attr[i], buffer_ptr[i], VX_WRITE_ONLY);
if (user_addr[i]) vxReleaseTensorAddressing(&(user_addr[i]));
}
for( i = 0; i < TENSOR_NUM; i ++ )
{
if (buffer_ptr[i]) free(buffer_ptr[i]);
}
return status;
} /* _VX_KERNEL_FUNC_KERNEL() */
static vx_param_description_t s_params[] =
{
{ VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
{ VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
{ VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED },
{ VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED },
{ VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL },
{ VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL },
{ VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL },
{ VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL },
};
#ifdef __cplusplus
extern "C" {
#endif
vx_kernel_description_t _VX_KERNEL_VAR =
{
_VX_KERNEL_ID,
_VX_KERNEL_NAME,
_VX_KERNEL_FUNC_KERNEL,
s_params,
_cnt_of_array( s_params ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t * vx_kernel_REDUCE_list[] =
{
&_VX_KERNEL_VAR,
NULL
};
#ifdef __cplusplus
}
#endif

View File

@ -1,283 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "vsi_nn_platform.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_math.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vx_lib_nnext.h"
#define _VX_KERNEL_VAR (vx_kernel_RESIZE)
#define _VX_KERNEL_ID (VX_KERNEL_ENUM_RESIZE)
#define _VX_KERNEL_NAME ("vsi_nn_kernel_resize")
#define _VX_KERNEL_FUNC_KERNEL (vxResizeKernel)
static vsi_status VX_CALLBACK vxResizeKernel
(
vx_node node,
const vx_reference* paramObj,
uint32_t paramNum
)
{
/* TODO: */
#define ARG_NUM (1)
#define TENSOR_NUM_INPUT (1)
#define TENSOR_NUM_OUTPUT (1)
#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT)
vsi_status status = VX_SUCCESS;
vx_context context = NULL;
vsi_nn_tensor_attr_t attr[TENSOR_NUM];
uint32_t stride_size[TENSOR_NUM][VSI_NN_MAX_DIM_NUM];
vx_tensor_addressing user_addr[TENSOR_NUM] = {NULL};
uint8_t *buffer_ptr[TENSOR_NUM] = {NULL};
vx_tensor tensor[TENSOR_NUM];
float factor0;
int32_t factor;
uint32_t batch, c, h, w;
uint32_t i, j, k, b;
//prepare data
context = vxGetContext((vx_reference)node);
for( i = 0; i < TENSOR_NUM_INPUT; i ++ )
{
tensor[i] = (vx_tensor)paramObj[i];
buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i],
&(attr[i]), stride_size[i], &(user_addr[i]), VX_READ_ONLY);
}
for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ )
{
tensor[i] = (vx_tensor)paramObj[i];
buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i],
&(attr[i]), stride_size[i], &(user_addr[i]), VX_WRITE_ONLY);
}
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(factor0), VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
//op calc
if (factor0 > 1)
{
factor = (int32_t)(factor0 + 0.5);
w = attr[0].size[0];
h = attr[0].size[1];
c = attr[0].size[2];
batch = 1;
for(b = 0; b < batch; ++b){
for(k = 0; k < c; ++k){
for(j = 0; j < h*factor; ++j){
for(i = 0; i < w*factor; ++i){
int32_t in_index = b*w*h*c + k*w*h + (j/factor)*w + i/factor;
int32_t out_index = b*w*h*c*factor*factor + k*w*h*factor*factor +
j*w*factor + i;
float fval;
//out[out_index] = in[in_index];
vsi_nn_DtypeToFloat32(&buffer_ptr[0][stride_size[0][0] * in_index],
&fval, &attr[0].dtype);
vsi_nn_Float32ToDtype(fval, &buffer_ptr[1][stride_size[1][0] * out_index],
&attr[1].dtype);
}
}
}
}
}
else
{
factor = (int32_t)(1 / factor0 + 0.5);
w = attr[1].size[0];
h = attr[1].size[1];
c = attr[1].size[2];
batch = 1;
for(b = 0; b < batch; ++b){
for(k = 0; k < c; ++k){
for(j = 0; j < h; ++j){
for(i = 0; i < w; ++i){
int32_t in_index = b*w*h*c*factor*factor +
k*w*h*factor*factor + j*w*factor*factor + i*factor;
int32_t out_index = b*w*h*c + k*w*h + j * w + i;
float fval;
//out[out_index] = in[in_index];
vsi_nn_DtypeToFloat32(&buffer_ptr[0][stride_size[0][0] * in_index], &fval,
&attr[0].dtype);
vsi_nn_Float32ToDtype(fval, &buffer_ptr[1][stride_size[1][0] * out_index],
&attr[1].dtype);
}
}
}
}
}
//save data
for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ )
{
status = vsi_nn_copy_tensor_patch(tensor[i], &attr[i], buffer_ptr[i], VX_WRITE_ONLY);
if (user_addr[i]) vxReleaseTensorAddressing(&(user_addr[i]));
}
for( i = 0; i < TENSOR_NUM; i ++ )
{
if (buffer_ptr[i]) free(buffer_ptr[i]);
}
return status;
} /* _VX_KERNEL_FUNC_KERNEL() */
static vx_param_description_t s_params[] =
{
{ VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
{ VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
{ VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED },
};
vsi_status VX_CALLBACK vxTensorResizeInitializer
(
vx_node nodObj,
const vx_reference *paramObj,
uint32_t paraNum
)
{
// Alignment with a power of two value.
#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1)
vx_kernel_execution_parameters_t shaderParam = {
2, // workdim
{0, 0, 0}, // globalWorkOffset: controls the start location to be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in threads
{0, 0, 0}}; // globalWorkSize: image size in threads
uint32_t uniPackEvenData_2x8[16] = {
0x33333333, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00003400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
};
vsi_status status = VX_SUCCESS;
vx_tensor input = (vx_tensor)paramObj[0];
uint32_t input_size[DIM_SIZE] = {1, 1, 1, 1};
vsi_nn_tensor_attr_t attr;
uint32_t i, input_dim;
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
status = vsi_nn_vxGetTensorAttr(input, &attr);
if (status != VX_SUCCESS)
{
VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
return status;
}
input_dim = attr.dim_num;
for (i = 0; i < input_dim; i++)
{
input_size[i] = attr.size[i];
}
shaderParam.globalWorkOffset[0] = 0;
shaderParam.globalWorkOffset[1] = 0;
shaderParam.globalWorkScale[0] = 16;
shaderParam.globalWorkScale[1] = 2;
shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1)
/ shaderParam.globalWorkScale[0], 4);
shaderParam.globalWorkSize[1] = (input_size[1] + shaderParam.globalWorkScale[1] - 1)
/ shaderParam.globalWorkScale[1];
vxSetNodeUniform(nodObj, "uniPackEvenData_2x8", 1, uniPackEvenData_2x8);
status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS,
&shaderParam, sizeof(vx_kernel_execution_parameters_t));
return VX_SUCCESS;
}
static vx_param_description_t vxTensorResizeKernelParam[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}
};
#ifdef __cplusplus
extern "C" {
#endif
vx_kernel_description_t _VX_KERNEL_VAR =
{
_VX_KERNEL_ID,
_VX_KERNEL_NAME,
_VX_KERNEL_FUNC_KERNEL,
s_params,
_cnt_of_array( s_params ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t vxTensorResize16BitsDownSampleQuarterKernelInfo =
{
VX_KERNEL_ENUM_RESIZE_16BITS_DOWNSAMPLE_QUARTER,
VX_KERNEL_NAME_RESIZE_16BITS_DOWNSAMPLE_QUARTER,
NULL,
vxTensorResizeKernelParam,
(sizeof(vxTensorResizeKernelParam) / sizeof(vxTensorResizeKernelParam[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
vxTensorResizeInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t vxTensorResize8BitsDownSampleQuarterKernelInfo =
{
VX_KERNEL_ENUM_RESIZE_8BITS_DOWNSAMPLE_QUARTER,
VX_KERNEL_NAME_RESIZE_8BITS_DOWNSAMPLE_QUARTER,
NULL,
vxTensorResizeKernelParam,
(sizeof(vxTensorResizeKernelParam) / sizeof(vxTensorResizeKernelParam[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
vxTensorResizeInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t * vx_kernel_RESIZE_list[] =
{
&_VX_KERNEL_VAR,
&vxTensorResize16BitsDownSampleQuarterKernelInfo,
&vxTensorResize8BitsDownSampleQuarterKernelInfo,
NULL
};
#ifdef __cplusplus
}
#endif

View File

@ -1,317 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <math.h>
#include "vsi_nn_platform.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
#include "vsi_nn_test.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_math.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vx_lib_nnext.h"
#define _VX_KERNEL_VAR (vx_kernel_ROI_ALIGN)
#define _VX_KERNEL_ID (VX_KERNEL_ENUM_ROI_ALIGN)
#define _VX_KERNEL_NAME (VX_KERNEL_NAME_ROI_ALIGN)
#define _VX_KERNEL_FUNC_KERNEL (vxRoi_alignKernel)
static vsi_status VX_CALLBACK vxRoi_alignKernel
(
vx_node node,
const vx_reference* paramObj,
uint32_t paramNum
)
{
#define ARG_NUM (6)
#define TENSOR_NUM_INPUT (3)
#define TENSOR_NUM_OUTPUT (1)
#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT)
vsi_status status = VSI_FAILURE;
vx_context context = NULL;
vx_tensor input[TENSOR_NUM_INPUT] = {0};
vx_tensor output[TENSOR_NUM_OUTPUT] = {0};
float *f32_in_buffer[TENSOR_NUM_INPUT] = {0};
int32_t* int32_in_buffer[TENSOR_NUM_INPUT] = {0};
float *f32_out_buffer[TENSOR_NUM_OUTPUT] = {0};
vsi_nn_tensor_attr_t in_attr[TENSOR_NUM_INPUT];
vsi_nn_tensor_attr_t out_attr[TENSOR_NUM_OUTPUT];
uint32_t in_elements[TENSOR_NUM_INPUT] = {0};
uint32_t out_elements[TENSOR_NUM_OUTPUT]= {0};
int32_t output_height;
int32_t output_width;
float height_ratio;
float width_ratio;
int32_t height_sample_num;
int32_t width_sample_num;
uint32_t i = 0;
for(i = 0; i < TENSOR_NUM_INPUT; i++)
{
memset(&in_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t));
}
for(i = 0; i < TENSOR_NUM_OUTPUT; i++)
{
memset(&out_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t));
}
/* prepare data */
context = vxGetContext((vx_reference)node);
for(i = 0; i < TENSOR_NUM_INPUT; i ++)
{
input[i] = (vx_tensor)paramObj[i];
status = vsi_nn_vxGetTensorAttr(input[i], &in_attr[i]);
TEST_CHECK_STATUS(status, final);
in_elements[i] = vsi_nn_vxGetTensorElementNum(&in_attr[i]);
if (i == 2)
{
int32_in_buffer[i] = (int32_t *)vsi_nn_vxCopyTensorToData(context,
input[i], &in_attr[i]);
}
else
{
f32_in_buffer[i] = (float *)malloc(in_elements[i] * sizeof(float));
status = vsi_nn_vxConvertTensorToFloat32Data(
context, input[i], &in_attr[i], f32_in_buffer[i],
in_elements[i] * sizeof(float));
TEST_CHECK_STATUS(status, final);
}
}
for(i = 0; i < TENSOR_NUM_OUTPUT; i ++)
{
output[i] = (vx_tensor)paramObj[i + TENSOR_NUM_INPUT];
status = vsi_nn_vxGetTensorAttr(output[i], &out_attr[i]);
TEST_CHECK_STATUS(status, final);
out_elements[i] = vsi_nn_vxGetTensorElementNum(&out_attr[i]);
f32_out_buffer[i]= (float *)malloc(out_elements[i] * sizeof(float));
memset(f32_out_buffer[i], 0, out_elements[i] * sizeof(float));
}
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(output_height),
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 1], &(output_width),
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 2], &(height_ratio),
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 3], &(width_ratio),
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 4], &(height_sample_num),
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 5], &(width_sample_num),
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
/* CPU reference implementation */
{
uint32_t n, j, k;
uint32_t kRoiDim = 4;
float heightScale = 1.0f / height_ratio;
float widthScale = 1.0f / width_ratio;
uint32_t inHeight = in_attr[0].size[2];
uint32_t inWidth = in_attr[0].size[1];
uint32_t inDepth = in_attr[0].size[0];
uint32_t numRois = in_attr[1].size[1];
uint32_t outHeight = out_attr[0].size[2];
uint32_t outWidth = out_attr[0].size[1];
uint32_t out_index = 0;
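/* ROI Align reference: each ROI is divided into an outHeight x outWidth grid of bins;
 * every bin is sampled on an hSamplingRatio x wSamplingRatio grid of points, each
 * sample is bilinearly interpolated from the four surrounding input pixels, and the
 * samples are averaged per channel to produce the output value. */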
for(n = 0; n < numRois; n++)
{
uint32_t batchId = int32_in_buffer[2][n];
float scale = (in_attr[1].dtype.vx_type == VSI_NN_TYPE_UINT16) ? 0.125f : 1.0f;
float wRoiStart = f32_in_buffer[1][n * kRoiDim] * widthScale * scale;
float hRoiStart = f32_in_buffer[1][n * kRoiDim + 1] * heightScale * scale;
float wRoiEnd = f32_in_buffer[1][n * kRoiDim + 2] * widthScale * scale;
float hRoiEnd = f32_in_buffer[1][n * kRoiDim + 3] * heightScale * scale;
float roiWidth = vsi_nn_max((wRoiEnd - wRoiStart), 1.0f);
float roiHeight = vsi_nn_max((hRoiEnd - hRoiStart), 1.0f);
float wStepSize = roiWidth / outWidth;
float hStepSize = roiHeight / outHeight;
uint32_t wSamplingRatio = width_sample_num > 0
? width_sample_num : (uint32_t)ceil(wStepSize);
uint32_t hSamplingRatio = height_sample_num > 0
? height_sample_num : (uint32_t)ceil(hStepSize);
int32_t numSamplingPoints = wSamplingRatio * hSamplingRatio;
float wBinSize = wStepSize / (float)(wSamplingRatio);
float hBinSize = hStepSize / (float)(hSamplingRatio);
int32_t batch_base_index = batchId * inHeight * inWidth * inDepth;
for (i = 0; i < outHeight; i++)
{
for (j = 0; j < outWidth; j++)
{
float wStart = wStepSize * j + wRoiStart;
float wEnd = wStepSize * (j + 1) + wRoiStart;
float hStart = hStepSize * i + hRoiStart;
float hEnd = hStepSize * (i + 1) + hRoiStart;
float x,y;
for (y = hStart + hBinSize / 2; y < hEnd; y += hBinSize)
{
for (x = wStart + wBinSize / 2; x < wEnd; x += wBinSize)
{
uint32_t x1 = (uint32_t)floor(x);
uint32_t y1 = (uint32_t)floor(y);
uint32_t x2 = x1 + 1, y2 = y1 + 1;
float dx1 = x - (float)(x1);
float dy1 = y - (float)(y1);
if (x1 >= inWidth - 1) {
x1 = x2 = inWidth - 1;
dx1 = 0;
}
if (y1 >= inHeight - 1) {
y1 = y2 = inHeight - 1;
dy1 = 0;
}
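/* Bilinear interpolation: the four weights below are the areas of the sub-rectangles
 * opposite each corner pixel of the unit cell containing the sample point (x, y). */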
{
float dx2 = 1.0f - dx1, dy2 = 1.0f - dy1;
float ws[] = {dx2 * dy2, dx1 * dy2,
dx2 * dy1, dx1 * dy1};
uint32_t offsets[] = {y1 * inWidth * inDepth + x1 * inDepth,
y1 * inWidth * inDepth + x2 * inDepth,
y2 * inWidth * inDepth + x1 * inDepth,
y2 * inWidth * inDepth + x2 * inDepth};
for (k = 0; k < inDepth; k++) {
float interpolation = 0;
uint32_t c;
for (c = 0; c < 4; c++)
{
interpolation += ws[c]
* f32_in_buffer[0][batch_base_index + offsets[c] + k];
}
f32_out_buffer[0][out_index + k] += interpolation;
}
}
}
}
for (k = 0; k < inDepth; k++)
{
f32_out_buffer[0][out_index + k] /= (float)(numSamplingPoints);
}
out_index += inDepth;
}
}
}
}
/* save data */
for(i = 0; i < TENSOR_NUM_OUTPUT; i++)
{
status = vsi_nn_vxConvertFloat32DataToTensor(
context, output[i], &out_attr[i], f32_out_buffer[i],
out_elements[i] * sizeof(float));
TEST_CHECK_STATUS(status, final);
}
final:
for (i = 0; i < TENSOR_NUM_INPUT; i++)
{
if (f32_in_buffer[i]) free(f32_in_buffer[i]);
if (int32_in_buffer[i]) free(int32_in_buffer[i]);
}
for(i = 0; i < TENSOR_NUM_OUTPUT; i++)
{
if (f32_out_buffer[i]) free(f32_out_buffer[i]);
}
return status;
} /* _VX_KERNEL_FUNC_KERNEL() */
static vx_param_description_t vxRoi_alignKernelParam[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
vx_status VX_CALLBACK vxRoi_alignInitializer
(
vx_node nodObj,
const vx_reference *paramObj,
vx_uint32 paraNum
)
{
vx_status status = VX_SUCCESS;
/*TODO: Add initial code for VX program*/
return status;
}
#ifdef __cplusplus
extern "C" {
#endif
vx_kernel_description_t vxRoi_align_CPU =
{
_VX_KERNEL_ID,
_VX_KERNEL_NAME,
_VX_KERNEL_FUNC_KERNEL,
vxRoi_alignKernelParam,
_cnt_of_array( vxRoi_alignKernelParam ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t vxRoi_align_VX =
{
_VX_KERNEL_ID,
_VX_KERNEL_NAME,
NULL,
vxRoi_alignKernelParam,
_cnt_of_array( vxRoi_alignKernelParam ),
vsi_nn_KernelValidator,
NULL,
NULL,
vxRoi_alignInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t * vx_kernel_ROI_ALIGN_list[] =
{
&vxRoi_align_CPU,
&vxRoi_align_VX,
NULL
};
#ifdef __cplusplus
}
#endif

View File

@@ -1,410 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "vsi_nn_platform.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
#include "vsi_nn_test.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_math.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vx_lib_nnext.h"
#define _VX_KERNEL_VAR (vx_kernel_SCALE)
#define _VX_KERNEL_ID (VX_KERNEL_ENUM_SCALE)
#define _VX_KERNEL_NAME ("vsi_nn_kernel_scale")
#define _VX_KERNEL_FUNC_KERNEL (vxScaleKernel)
static vsi_status VX_CALLBACK vxScaleKernel
(
vx_node node,
const vx_reference* paramObj,
uint32_t paramNum
)
{
vsi_status status = VX_ERROR_INVALID_PARAMETERS;
if( 6 == paramNum )
{
vx_context context = NULL;
vx_tensor input_tensor = NULL;
vx_tensor scale_tensor = NULL;
vx_tensor bias_tensor = NULL;
vx_tensor output_tensor = NULL;
uint8_t * input_buffer = NULL;
uint8_t * scale_buffer = NULL;
uint8_t * bias_buffer = NULL;
uint8_t * output_buffer = NULL;
vx_scalar axis_scalar = NULL;
vx_scalar has_bias_scalar = NULL;
int axis = 1;
float has_bias = 0;
uint32_t input_dims = 0;
uint32_t scale_dims = 0;
uint32_t bias_dims = 0;
uint32_t output_dims = 0;
vsi_enum inputFormat = VSI_NN_TYPE_FLOAT16;
vsi_enum scaleFormat = VSI_NN_TYPE_FLOAT16;
vsi_enum biasFormat = VSI_NN_TYPE_FLOAT32;
vsi_enum outputFormat = VSI_NN_TYPE_FLOAT16;
uint32_t input_size[4] = {1, 1, 1, 1};
uint32_t scale_size[4] = {1, 1, 1, 1};
uint32_t bias_size[4] = {1, 1, 1, 1};
uint32_t output_size[4] = {1, 1, 1, 1};
uint32_t input_stride_size[VSI_NN_MAX_DIM_NUM] = { 0 };
uint32_t output_stride_size[VSI_NN_MAX_DIM_NUM] = { 0 };
vx_tensor_addressing input_user_addr = NULL;
vx_tensor_addressing scale_user_addr = NULL;
vx_tensor_addressing bias_user_addr = NULL;
vx_tensor_addressing output_user_addr = NULL;
vsi_nn_tensor_attr_t out_attr;
status = VX_SUCCESS;
memset(&out_attr, 0x0, sizeof(vsi_nn_tensor_attr_t));
input_tensor = (vx_tensor)paramObj[0];
scale_tensor = (vx_tensor)paramObj[1];
bias_tensor = (vx_tensor)paramObj[2];
output_tensor = (vx_tensor)paramObj[3];
axis_scalar = (vx_scalar)paramObj[4];
has_bias_scalar = (vx_scalar)paramObj[5];
context = vxGetContext((vx_reference)node);
if( NULL == context)
{
VSILOGE("vxGetContext failure!\n");
status = VX_FAILURE;
goto OnError;
}
input_buffer = vsi_nn_ConvertRawTensorToData(context, input_tensor,
&input_dims, &inputFormat, input_size, input_stride_size,
&input_user_addr, VX_READ_ONLY);
if( NULL == input_buffer )
{
VSILOGE("vsi_nn_ConvertRawTensorToData failure!\n");
status = VX_ERROR_NO_MEMORY;
goto OnError;
}
scale_buffer = vsi_nn_ConvertRawTensorToData(context, scale_tensor,
&scale_dims, &scaleFormat, scale_size, input_stride_size,
&scale_user_addr, VX_READ_ONLY);
if( NULL == scale_buffer )
{
VSILOGE("vsi_nn_ConvertRawTensorToData failure!\n");
status = VX_ERROR_NO_MEMORY;
goto OnError;
}
bias_buffer = vsi_nn_ConvertRawTensorToData(context, bias_tensor,
&bias_dims, &biasFormat, bias_size, input_stride_size,
&bias_user_addr, VX_READ_ONLY);
if( NULL == bias_buffer )
{
VSILOGE("vsi_nn_ConvertRawTensorToData failure!\n");
status = VX_ERROR_NO_MEMORY;
goto OnError;
}
output_buffer = vsi_nn_ConvertRawTensorToData(context, output_tensor,
&output_dims, &outputFormat, output_size, output_stride_size,
&output_user_addr, VX_WRITE_ONLY);
if( NULL == output_buffer )
{
VSILOGE("vsi_nn_ConvertRawTensorToData failure!\n");
status = VX_ERROR_NO_MEMORY;
goto OnError;
}
status = vsi_nn_vxGetTensorAttr(output_tensor, &out_attr);
if (status != VX_SUCCESS)
{
VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
goto OnError;
}
status = vxCopyScalar(axis_scalar, &axis, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
if( VX_SUCCESS != status)
{
VSILOGE("vxCopyScalar axis failure! status:%d\n", status);
goto OnError;
}
status = vxCopyScalar(has_bias_scalar, &has_bias, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
if( VX_SUCCESS != status )
{
VSILOGE("vxCopyScalar axis failure! has_bias:%f\n", has_bias);
goto OnError;
}
if( input_dims != output_dims )
{
VSILOGE("Invalid parameters, input_dims output_dims mismatch %d:%d\n",
input_dims, output_dims);
status = VX_ERROR_INVALID_PARAMETERS;
goto OnError;
}
if( input_size[0] != scale_size[0] || input_size[0] != bias_size[0] )
{
VSILOGE("Invalid parameters, input size mismatch %d:%d:%d\n",
input_size[0], scale_size[0], bias_size[0]);
status = VX_ERROR_INVALID_PARAMETERS;
goto OnError;
}
{
uint32_t i = 0;
uint32_t j = 0;
uint32_t fixed_num = 1;
uint32_t changed_num = 1;
fixed_num = input_size[1] * input_size[2] * input_size[3];
changed_num = input_size[0];
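/* Per-element compute: out[i][j] = in[i][j] * scale[j] + bias[j], with j running along
 * axis 0 (length input_size[0]) and i over all remaining dimensions; input, scale and
 * output are fp16, bias is fp32. */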
for( i = 0; i < fixed_num; i++ )
{
int16_t* cur_input_row_ofst = ((int16_t *)input_buffer) + i * changed_num;
int16_t* cur_scale_row_ofst = ((int16_t *)scale_buffer);
float* cur_bias_row_ofst = ((float *)bias_buffer);
int16_t* cur_output_row_ofst = ((int16_t *)output_buffer) + i * changed_num;
for( j = 0; j < changed_num; j++ )
{
float cur_input_v = vsi_nn_Fp16ToFp32(*(cur_input_row_ofst + j));
float cur_scale_v = vsi_nn_Fp16ToFp32(*(cur_scale_row_ofst + j));
float cur_bias_v = *(cur_bias_row_ofst + j);
float cur_result = cur_input_v * cur_scale_v + cur_bias_v;
*(cur_output_row_ofst + j) = vsi_nn_Fp32ToFp16(cur_result);
}
}
#if defined(_SAVE_TENSOR)
{
static int count = 0;
char fname[256] = { 0 };
sprintf(fname, "scale_output_tensor.%d.axis.%d.txt", count, axis);
vsi_nn_SaveDataToText(fname, output_buffer,
vsi_nn_ShapeProduct(output_size, output_dims), VSI_NN_TYPE_FLOAT16, NULL);
count++;
}
#endif
}
status = vsi_nn_vxCopyDataToTensor(context, output_tensor, &out_attr, output_buffer);
TEST_CHECK_STATUS(status, OnError);
OnError:
if( NULL != input_buffer )
{
free( input_buffer );
input_buffer = NULL;
}
if( NULL != scale_buffer )
{
free( scale_buffer );
scale_buffer = NULL;
}
if( NULL != bias_buffer )
{
free( bias_buffer );
bias_buffer = NULL;
}
if( NULL != output_buffer )
{
free( output_buffer );
output_buffer = NULL;
}
if (input_user_addr)
{
vxReleaseTensorAddressing(&input_user_addr);
}
if (scale_user_addr)
{
vxReleaseTensorAddressing(&scale_user_addr);
}
if (bias_user_addr)
{
vxReleaseTensorAddressing(&bias_user_addr);
}
if (output_user_addr)
{
vxReleaseTensorAddressing(&output_user_addr);
}
}
return status;
} /* _VX_KERNEL_FUNC_KERNEL() */
static vx_param_description_t s_params[] =
{
{ VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
{ VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
{ VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL },
{ VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
{ VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED },
{ VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED },
};
vsi_status VX_CALLBACK vxScaleInitializer
(
vx_node nodObj,
const vx_reference *paramObj,
uint32_t paraNum
)
{
// Alignment with a power of two value.
#define gcmALIGN(n, align) (((n) + ((align) - 1)) & ~((align) - 1))
vx_kernel_execution_parameters_t shaderParam = {
2, // workdim
{0, 0, 0}, // globalWorkOffset: controls the start location to be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels can be processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in threads
{0, 0, 0}}; // globalWorkSize: image size in threads
uint32_t uniExtractHalf8_2x8[16] = {
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00002100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
};
uint32_t uniFp16MulFp16ToFp32_Lo_4x4[16] = {
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x01010101, // BSelt
0x00010000, 0x00030002, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
};
uint32_t uniFp16MulFp16ToFp32_Hi_4x4[16] = {
0x01010101, // TCfg
0x00000000, // ASelt
0x00050004, 0x00070006, // ABin
0x01010101, // BSelt
0x00050004, 0x00070006, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
};
vsi_status status = VX_SUCCESS;
vx_tensor input = (vx_tensor)paramObj[0];
uint32_t input_size[DIM_SIZE] = {1, 1, 1, 1};
vx_uint32 i = 0;
vsi_nn_tensor_attr_t attr;
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
status = vsi_nn_vxGetTensorAttr(input, &attr);
if (status != VX_SUCCESS)
{
VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
return status;
}
for (i = 0; i < attr.dim_num; i++)
{
input_size[i] = attr.size[i];
}
shaderParam.globalWorkOffset[0] = 0;
shaderParam.globalWorkOffset[1] = 0;
shaderParam.globalWorkScale[0] = 8;
shaderParam.globalWorkScale[1] = 1;
shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1)
/ shaderParam.globalWorkScale[0], 4);
shaderParam.globalWorkSize[1] = (input_size[1] + shaderParam.globalWorkScale[1] - 1)
/ shaderParam.globalWorkScale[1];
vxSetNodeUniform(nodObj, "uniExtractHalf8_2x8", 1, uniExtractHalf8_2x8);
vxSetNodeUniform(nodObj, "uniFp16MulFp16ToFp32_Lo_4x4", 1, uniFp16MulFp16ToFp32_Lo_4x4);
vxSetNodeUniform(nodObj, "uniFp16MulFp16ToFp32_Hi_4x4", 1, uniFp16MulFp16ToFp32_Hi_4x4);
status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS,
&shaderParam, sizeof(vx_kernel_execution_parameters_t));
return status;
}
static vx_param_description_t vxScaleKernelParam[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}
};
#ifdef __cplusplus
extern "C" {
#endif
vx_kernel_description_t _VX_KERNEL_VAR =
{
_VX_KERNEL_ID,
_VX_KERNEL_NAME,
_VX_KERNEL_FUNC_KERNEL,
s_params,
_cnt_of_array( s_params ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t vxScaleKernelInfo =
{
VX_KERNEL_ENUM_SCALE,
VX_KERNEL_NAME_SCALE_FP16,
NULL,
vxScaleKernelParam,
(sizeof(vxScaleKernelParam) / sizeof(vxScaleKernelParam[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
vxScaleInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t * vx_kernel_SCALE_list[] =
{
&_VX_KERNEL_VAR,
&vxScaleKernelInfo,
NULL
};
#ifdef __cplusplus
}
#endif

View File

@@ -1,345 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "vsi_nn_platform.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
#include "vsi_nn_test.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_math.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vx_lib_nnext.h"
vsi_status vxShuffleChannelFunc
(
vx_context context,
vx_tensor input,
vx_tensor output,
int32_t group_number,
int32_t axis
)
{
vsi_status status = VX_SUCCESS;
vsi_nn_tensor_attr_t input_attr;
vsi_nn_tensor_attr_t output_attr;
uint8_t *in_data = NULL;
uint8_t *out_data = NULL;
uint32_t stride_size[VSI_NN_MAX_DIM_NUM] = {0};
uint32_t buf_sz = 0;
uint32_t group_row = group_number;
uint32_t chs = 0, group_col = 0;
uint32_t len = 1, num = 1, feature_map_size = 1;
uint32_t n = 0, i = 0, j = 0;
uint32_t type_bytes = 0, len_bytes = 0, fms_bytes = 0;
status = vsi_nn_vxGetTensorAttr(input, &input_attr);
status |= vsi_nn_vxGetTensorAttr(output, &output_attr);
TEST_CHECK_STATUS(status, final);
in_data = vsi_nn_vxCopyTensorToData(context, input, &input_attr);
TEST_CHECK_PTR(in_data, final);
buf_sz = vsi_nn_GetStrideSize(&output_attr, stride_size);
out_data = (uint8_t *)malloc( buf_sz );
TEST_CHECK_PTR(out_data, final);
chs = input_attr.size[axis];
group_col = chs / group_row;
type_bytes = vsi_nn_TypeGetBytes( input_attr.dtype.vx_type );
for ( i = 0; i < (uint32_t)axis; i++)
{
len *= input_attr.size[i];
}
for ( i = axis + 1; i < input_attr.dim_num; i++)
{
num *= input_attr.size[i];
}
for ( i = 0; i <= (uint32_t)axis; i++)
{
feature_map_size *= input_attr.size[i];
}
/* Shuffle Channel CPU implementation; the shape and dtype of the output must be the same as the input */
len_bytes = len * type_bytes;
fms_bytes = feature_map_size * type_bytes;
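/* Treat the axis dimension as a group_row x group_col matrix of channel groups and
 * transpose it: group (i, j) of the input is copied to position (j, i) of the output,
 * len elements (the product of the dimensions below axis) at a time. */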
for ( n = 0; n < num; n++)
{
for ( i = 0; i < group_row; i++)
{
for ( j = 0; j < group_col; j++)
{
uint8_t *in_ptr = in_data + n * fms_bytes + (i * group_col + j) * len_bytes;
uint8_t *out_ptr = out_data + n * fms_bytes + (j * group_row + i) * len_bytes;
memcpy(out_ptr, in_ptr, len_bytes);
}
}
}
/* Copy data to output tensor */
status = vsi_nn_vxCopyDataToTensor(context, output, &output_attr, out_data);
TEST_CHECK_STATUS(status, final);
final:
if (in_data) free(in_data);
if (out_data) free(out_data);
return status;
}
vsi_status VX_CALLBACK vxShuffleChannelKernel
(
vx_node node,
const vx_reference* paramObj,
uint32_t paramNum
)
{
vsi_status status = VX_ERROR_INVALID_PARAMETERS;
if(paramNum == 4)
{
vx_context context = NULL;
// tensor
vx_tensor imgObj[2] = { NULL };
// scalar
vx_scalar scalar[2] = { NULL };
int32_t group_number = 0;
int32_t axis = 0;
imgObj[0] = (vx_tensor)paramObj[0];
imgObj[1] = (vx_tensor)paramObj[1];
scalar[0] = (vx_scalar)paramObj[2];
scalar[1] = (vx_scalar)paramObj[3];
context = vxGetContext((vx_reference)node);
TEST_CHECK_PTR(context,final);
// scalar
status = vxCopyScalar(scalar[0], &group_number, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
TEST_CHECK_STATUS(status, final);
status = vxCopyScalar(scalar[1], &axis, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
TEST_CHECK_STATUS(status, final);
// Call C Prototype
status = vxShuffleChannelFunc(context, imgObj[0], imgObj[1], group_number, axis);
TEST_CHECK_STATUS(status, final);
}
final:
return status;
}
vsi_status VX_CALLBACK vxShuffleChannelInitializer
(
vx_node nodObj,
const vx_reference *paramObj,
uint32_t paraNum
)
{
vsi_status status = VX_SUCCESS;
// Alignment with a power of two value.
#define gcmALIGN(n, align) (((n) + ((align) - 1)) & ~((align) - 1))
vx_kernel_execution_parameters_t shaderParam = {
3, // workdim
{0, 0, 0}, // globalWorkOffset: controls the start location to be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels can be processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in threads
{0, 0, 0}}; // globalWorkSize: image size in threads
vx_tensor input = (vx_tensor)paramObj[0];
vx_scalar group_numbers = (vx_scalar)paramObj[2];
vx_scalar axis_s = (vx_scalar)paramObj[3];
uint32_t input_size[4] = {1, 1, 1, 1};
vsi_nn_type_e inputDataFormat = VSI_NN_TYPE_FLOAT16;
int32_t group_number = 0;
int32_t axis = 0;
int32_t group_column = 0;
float rgroup_column = 0.0f;
uint32_t chs = 0;
vx_uint32 i = 0;
vsi_nn_tensor_attr_t attr;
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
status = vsi_nn_vxGetTensorAttr(input, &attr);
if (status != VX_SUCCESS)
{
VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
return status;
}
for (i = 0; i < attr.dim_num; i++)
{
input_size[i] = attr.size[i];
}
inputDataFormat = attr.dtype.vx_type;
status |= vxCopyScalar(group_numbers, &group_number, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
status |= vxCopyScalar(axis_s, &axis, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
if(VX_SUCCESS != status)
{
VSILOGE("[%s : %d]Initializer failure! \n",__FILE__, __LINE__);
return status;
}
chs = input_size[axis];
if (chs % group_number)
{
VSILOGE("input channel can't be exact divided by group number! at line %d\n", __LINE__);
return VX_FAILURE;
}
shaderParam.globalWorkOffset[0] = 0;
shaderParam.globalWorkOffset[1] = 0;
shaderParam.globalWorkOffset[2] = 0;
if (axis == 2)
{
if (inputDataFormat == VSI_NN_TYPE_FLOAT16 || inputDataFormat == VSI_NN_TYPE_INT16)
shaderParam.globalWorkScale[0] = 8;
else
shaderParam.globalWorkScale[0] = 16;
shaderParam.globalWorkScale[1] = 4;
shaderParam.globalWorkScale[2] = 1;
shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1)
/ shaderParam.globalWorkScale[0], 4);
shaderParam.globalWorkSize[1] = (input_size[1] + shaderParam.globalWorkScale[1] - 1)
/ shaderParam.globalWorkScale[1];
shaderParam.globalWorkSize[2] = input_size[2];
}
else if (axis == 1)
{
shaderParam.globalWorkScale[0] = 32;
shaderParam.globalWorkScale[1] = 1;
shaderParam.globalWorkScale[2] = 1;
shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1)
/ shaderParam.globalWorkScale[0], 4);
shaderParam.globalWorkSize[1] = input_size[1];
shaderParam.globalWorkSize[2] = input_size[2];
}
else
{
VSILOGE("[%s : %d]Initializer failure, not support axis: %d! \n",__FILE__, __LINE__, axis);
return VX_FAILURE;
}
group_column = chs / group_number;
rgroup_column = 1.0f / group_column;
status |= vxSetNodeUniform(nodObj, "group_column", 1, &group_column);
status |= vxSetNodeUniform(nodObj, "rgroup_column", 1, &rgroup_column);
status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS,
&shaderParam, sizeof(vx_kernel_execution_parameters_t));
if(status < 0)
{
VSILOGE("[%s : %d]Initializer failure! \n",__FILE__, __LINE__);
}
return status;
}
static vx_param_description_t vxShuffleChannelKernelParam[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}
};
#ifdef __cplusplus
extern "C" {
#endif
vx_kernel_description_t vxShuffleChannelKernelInfo =
{
VX_KERNEL_ENUM_SHUFFLECHANNEL,
VX_KERNEL_NAME_SHUFFLECHANNEL,
NULL,
vxShuffleChannelKernelParam,
(sizeof(vxShuffleChannelKernelParam) / sizeof(vxShuffleChannelKernelParam[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
vxShuffleChannelInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t vxShuffleChannelKernelInfo8Bits =
{
VX_KERNEL_ENUM_SHUFFLECHANNEL,
VX_KERNEL_NAME_SHUFFLECHANNEL8BITS,
NULL,
vxShuffleChannelKernelParam,
(sizeof(vxShuffleChannelKernelParam) / sizeof(vxShuffleChannelKernelParam[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
vxShuffleChannelInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t vxShuffleChannelKernelInfo_CPU =
{
VX_KERNEL_ENUM_SHUFFLECHANNEL,
VX_KERNEL_NAME_SHUFFLECHANNEL,
vxShuffleChannelKernel,
vxShuffleChannelKernelParam,
(sizeof(vxShuffleChannelKernelParam) / sizeof(vxShuffleChannelKernelParam[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t vxShuffleChannelKernelInfo_16BitsAxis1 =
{
VX_KERNEL_ENUM_SHUFFLECHANNEL,
VX_KERNEL_NAME_SHUFFLECHANNEL16BITS_AXIS1,
NULL,
vxShuffleChannelKernelParam,
(sizeof(vxShuffleChannelKernelParam) / sizeof(vxShuffleChannelKernelParam[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
vxShuffleChannelInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t vxShuffleChannelKernelInfo_8BitsAxis1 =
{
VX_KERNEL_ENUM_SHUFFLECHANNEL,
VX_KERNEL_NAME_SHUFFLECHANNEL8BITS_AXIS1,
NULL,
vxShuffleChannelKernelParam,
(sizeof(vxShuffleChannelKernelParam) / sizeof(vxShuffleChannelKernelParam[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
vxShuffleChannelInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t * vx_kernel_SHUFFLECHANNEL_list[] =
{
&vxShuffleChannelKernelInfo_CPU,
&vxShuffleChannelKernelInfo,
&vxShuffleChannelKernelInfo8Bits,
&vxShuffleChannelKernelInfo_16BitsAxis1,
&vxShuffleChannelKernelInfo_8BitsAxis1,
NULL
};
#ifdef __cplusplus
}
#endif

View File

@@ -1,293 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "vsi_nn_platform.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_math.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vx_lib_nnext.h"
#define _VX_KERNEL_VAR (vx_kernel_SPACE2DEPTH)
#define _VX_KERNEL_ID (VX_KERNEL_ENUM_SPACE2DEPTH)
#define _VX_KERNEL_NAME ("vsi_nn_kernel_space2depth")
#define _VX_KERNEL_FUNC_KERNEL (vxSpace2DepthKernel)
static vsi_status VX_CALLBACK vxSpace2DepthKernel
(
vx_node node,
const vx_reference* paramObj,
uint32_t paramNum
)
{
/* CPU reference implementation of Space2Depth */
#define ARG_NUM (2)
#define TENSOR_NUM_INPUT (1)
#define TENSOR_NUM_OUTPUT (1)
#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT)
vsi_status status = VX_SUCCESS;
uint32_t i = 0;
vx_context context = NULL;
vsi_nn_tensor_attr_t attr[TENSOR_NUM];
uint32_t stride_size[TENSOR_NUM][VSI_NN_MAX_DIM_NUM];
vx_tensor_addressing user_addr[TENSOR_NUM] = {NULL};
uint8_t *buffer_ptr[TENSOR_NUM] = {NULL};
vx_tensor tensor[TENSOR_NUM] = {NULL};
int32_t block_size_x = 0, block_size_y = 0;
int32_t output_depth = 0, output_height = 0, output_width = 0;
int32_t input_batch = 0, input_depth = 0, input_height = 0, input_width = 0;
int32_t batch = 0, dim = 0;
for(i = 0; i < TENSOR_NUM; i++)
{
memset(&attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t));
}
//prepare data
context = vxGetContext((vx_reference)node);
for( i = 0; i < TENSOR_NUM_INPUT; i ++ )
{
tensor[i] = (vx_tensor)paramObj[i];
buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i],
&(attr[i]), stride_size[i], &(user_addr[i]), VX_READ_ONLY);
}
for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ )
{
tensor[i] = (vx_tensor)paramObj[i];
buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i],
&(attr[i]), stride_size[i], &(user_addr[i]), VX_WRITE_ONLY);
}
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(block_size_x),
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 1], &(block_size_y),
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
dim = attr[0].dim_num;
if(dim < 4)
attr[0].size[3] = 1;
//op calc
//output_batch = attr[1].size[3];
output_depth = attr[1].size[2];
output_height = attr[1].size[1];
output_width = attr[1].size[0];
input_batch = attr[0].size[3];
input_depth = attr[0].size[2];
input_height = attr[0].size[1];
input_width = attr[0].size[0];
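/* Space-to-depth mapping: each block_size_x x block_size_y spatial block folds into
 * the depth dimension:
 *   out_w = in_w / block_size_x, out_h = in_h / block_size_y,
 *   out_d = (in_w % block_size_x) + (in_h % block_size_y) * block_size_x
 *           + in_d * block_size_x * block_size_y */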
for (batch = 0; batch < input_batch; ++batch)
{
vx_uint32 output_batch_index = batch * output_height * output_width * output_depth;
vx_uint32 input_batch_index = batch * input_height * input_width * input_depth;
vx_uint32 in_d;
for (in_d = 0; in_d < (vx_uint32)input_depth; in_d ++)
{
vx_uint32 in_h;
for (in_h = 0; in_h < (vx_uint32)input_height; ++ in_h)
{
vx_uint32 in_w;
for (in_w = 0; in_w < (vx_uint32)input_width; in_w ++)
{
vx_int32 out_w = in_w / block_size_x;
vx_int32 out_h = in_h / block_size_y;
//vx_int32 out_d = (in_w % block_size_x) * input_depth + (in_h % block_size_y) * block_size_x * input_depth + in_d;
vx_int32 out_d = (in_w % block_size_x) + (in_h % block_size_y) * block_size_x + in_d * block_size_x * block_size_y;
vx_int32 in_index = in_w + in_h * input_width +in_d * input_height * input_width + input_batch_index;
vx_int32 out_index = out_w + out_h * output_width + out_d * output_width * output_height + output_batch_index;
//outputBase[out_index] = inputBase[in_index];
float fval;
vsi_nn_DtypeToFloat32(&buffer_ptr[0][stride_size[0][0] * in_index],
&fval, &attr[0].dtype);
vsi_nn_Float32ToDtype(fval, &buffer_ptr[1][stride_size[1][0] * out_index],
&attr[1].dtype);
}
}
}
}
//save data
for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ )
{
vsi_nn_copy_tensor_patch(tensor[i], &attr[i], buffer_ptr[i], VX_WRITE_ONLY);
}
for( i = 0; i < TENSOR_NUM; i ++ )
{
if (user_addr[i]) vxReleaseTensorAddressing(&(user_addr[i]));
if (buffer_ptr[i]) free(buffer_ptr[i]);
}
return status;
} /* _VX_KERNEL_FUNC_KERNEL() */
vsi_status VX_CALLBACK vxSpace2DepthInitializer
(
vx_node nodObj,
const vx_reference *paramObj,
uint32_t paraNum
)
{
// Alignment with a power of two value.
#define gcmALIGN(n, align) (((n) + ((align) - 1)) & ~((align) - 1))
vx_kernel_execution_parameters_t shaderParam = {
3, // workdim
{0, 0, 0}, // globalWorkOffset: controls the start location to be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels can be processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in threads
{0, 0, 0}}; // globalWorkSize: image size in threads
vsi_status status = VX_SUCCESS;
vx_tensor input = (vx_tensor)paramObj[0];
uint32_t input_size[4] = {1, 1, 1, 1};
vx_uint32 input_dimz = 0;
vx_uint32 input_depth = 0;
vx_uint32 i = 0;
vsi_nn_tensor_attr_t attr;
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
status = vsi_nn_vxGetTensorAttr(input, &attr);
if (status != VX_SUCCESS)
{
VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
return status;
}
for (i = 0; i < attr.dim_num; i++)
{
input_size[i] = attr.size[i];
}
input_depth = input_size[2];
if(input_size[3] > 0)
input_dimz = input_depth * input_size[3];
shaderParam.globalWorkOffset[0] = 0;
shaderParam.globalWorkOffset[1] = 0;
shaderParam.globalWorkOffset[2] = 0;
shaderParam.globalWorkScale[0] = 8;
shaderParam.globalWorkScale[1] = 1;
shaderParam.globalWorkScale[2] = 1;
shaderParam.localWorkSize[0] = 8;
shaderParam.localWorkSize[1] = 1;
shaderParam.localWorkSize[2] = 1;
shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1)
/ shaderParam.globalWorkScale[0], shaderParam.localWorkSize[0]);
shaderParam.globalWorkSize[1] = gcmALIGN((input_size[1] + shaderParam.globalWorkScale[1] - 1)
/ shaderParam.globalWorkScale[1], shaderParam.localWorkSize[1]);
shaderParam.globalWorkSize[2] = input_dimz;
{
vx_uint32 uniExtractEvenFp16Stride2_4x4[16] = {
0x01010101, // TCfg
0x00000000, // ASelt
0x00020000, 0x00060004, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
};
vx_uint32 uniExtractOddFp16Stride2_4x4[16] = {
0x01010101, // TCfg
0x00000000, // ASelt
0x00030001, 0x00070005, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
};
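/* The two DP tables above select the even / odd fp16 lanes of an 8-wide read
 * (a stride-2 deinterleave); per their names they serve the block_size = 2 path
 * of the shader. */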
status |= vxSetNodeUniform(nodObj, "uniExtractEvenFp16Stride2_4x4", 1, uniExtractEvenFp16Stride2_4x4);
status |= vxSetNodeUniform(nodObj, "uniExtractOddFp16Stride2_4x4", 1, uniExtractOddFp16Stride2_4x4);
//status |= vxSetNodeUniform(nodObj, "input_depth", 1, &input_depth);
}
status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS,
&shaderParam, sizeof(vx_kernel_execution_parameters_t));
return status;
}
static vx_param_description_t s_params[] =
{
{ VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
{ VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
{ VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED },
{ VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED },
};
#ifdef __cplusplus
extern "C" {
#endif
vx_kernel_description_t _VX_KERNEL_VAR =
{
_VX_KERNEL_ID,
_VX_KERNEL_NAME,
_VX_KERNEL_FUNC_KERNEL,
s_params,
_cnt_of_array( s_params ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t vxSpace2DepthKernelInfo_int16_int16 =
{
_VX_KERNEL_ID,
VX_KERNEL_NAME_SPACE2DEPTH_INT16_INT16,
NULL,
s_params,
_cnt_of_array( s_params ),
vsi_nn_KernelValidator,
NULL,
NULL,
vxSpace2DepthInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t * vx_kernel_SPACE2DEPTH_list[] =
{
NULL,
&_VX_KERNEL_VAR,
&vxSpace2DepthKernelInfo_int16_int16,
NULL
};
#ifdef __cplusplus
}
#endif

View File

@@ -54,3 +54,81 @@ __kernel void a_times_b_plus_c_F16_F16_F16toF16_2D
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
_viv_uniform VXC_512Bits uniExtractHalf8_2x8;
_viv_uniform VXC_512Bits uniA_Times_B_lo_4x4;
_viv_uniform VXC_512Bits uniA_Times_B_hi_4x4;
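/* a * b + c with an fp32 addend: a and b are read as packed fp16 (8 lanes), multiplied
 * in two fp32 halves of four via the DP4x4 uniforms, c is read as fp32 with read_imagef,
 * and the fp32 result is converted back to fp16 for the output. */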
__kernel void a_times_b_plus_c_F16_F16_F32toF16
(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__read_only image2d_array_t input2,
__write_only image2d_array_t output
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_half8 src0, src1, dst;
vxc_ushort8 vec0, vec1, result;
float4 b0, b1;
float4 dst0, dst1;
VXC_ReadImage2DArray(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, src0, vec0, 16);
VXC_ReadImage2DArray(vec1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, src1, vec1, 16);
b0 = read_imagef(input2, coord);
coord.x += 4;
b1 = read_imagef(input2, coord);
coord.x -= 4;
VXC_DP4x4(dst0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniA_Times_B_lo_4x4);
VXC_DP4x4(dst1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniA_Times_B_hi_4x4);
dst0 += b0;
dst1 += b1;
half4 t0, t1;
_viv_asm(CONV, t0, dst0);
_viv_asm(CONV, t1, dst1);
VXC_DP2x8(dst, t0, t1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8);
_viv_asm(COPY, result, dst, 16);
VXC_WriteImage2DArray(output, coord, result, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
__kernel void a_times_b_plus_c_F16_F16_F32toF16_2D
(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__read_only image2d_t input2,
__write_only image2d_array_t output
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);
vxc_half8 src0, src1, dst;
vxc_ushort8 vec0, vec1, result;
float4 b0, b1;
float4 dst0, dst1;
VXC_ReadImage(vec0, input0, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, src0, vec0, 16);
VXC_ReadImage(vec1, input1, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, src1, vec1, 16);
b0 = read_imagef(input2, coord.xy);
coord.z = coord.x + 4;
b1 = read_imagef(input2, coord.zy);
VXC_DP4x4(dst0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniA_Times_B_lo_4x4);
VXC_DP4x4(dst1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniA_Times_B_hi_4x4);
dst0 += b0;
dst1 += b1;
half4 t0, t1;
_viv_asm(CONV, t0, dst0);
_viv_asm(CONV, t1, dst1);
VXC_DP2x8(dst, t0, t1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8);
_viv_asm(COPY, result, dst, 16);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}

View File

@@ -1,10 +1,11 @@
#include "cl_viv_vx_ext.h"
_viv_uniform int indices_num;
_viv_uniform VXC_512Bits uniExtraCopyDpKeepinEvis_2x8;
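/* Gather along the chosen axis: each work-item reads one index from input1 at (gidy, 0)
 * and maps it to source line gidz * axis_num + index of input0; the gathered elements
 * land in line gidz * indices_num + gidy of the output. */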
__kernel void gather_I8toI8(
__read_only image2d_t input0,
__read_only image2d_array_t input1,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int block_num,
@@ -16,7 +17,7 @@ __kernel void gather_I8toI8(
int gidz = get_global_id(2); // block_num
int4 coord_in = (int4)(gidy, 0, gidx, 0);
int4 indice = read_imagei(input1, coord_in.xyyy);
int4 indice = read_imagei(input1, coord_in.xy);
coord_in.w = gidz * axis_num + indice.x;
vxc_char16 src;
@@ -28,7 +29,7 @@ __kernel void gather_I8toI8(
__kernel void gather_U8toU8(
__read_only image2d_t input0,
__read_only image2d_array_t input1,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int block_num,
@@ -40,7 +41,7 @@ __kernel void gather_U8toU8(
int gidz = get_global_id(2); // block_num
int4 coord_in = (int4)(gidy, 0, gidx, 0);
int4 indice = read_imagei(input1, coord_in.xyyy);
int4 indice = read_imagei(input1, coord_in.xy);
coord_in.w = gidz * axis_num + indice.x;
vxc_uchar16 src;
@@ -52,7 +53,7 @@ __kernel void gather_U8toU8(
__kernel void gather_I16toI16(
__read_only image2d_t input0,
__read_only image2d_array_t input1,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int block_num,
@@ -66,7 +67,7 @@ __kernel void gather_I16toI16(
int4 coord_in = (int4)(gidy, 0, gidx, 0);
int4 indice = read_imagei(input1, coord_in.xyyy);
int4 indice = read_imagei(input1, coord_in.xy);
coord_in.w = gidz * axis_num + indice.x;
vxc_short8 src;
@@ -78,7 +79,7 @@ __kernel void gather_I16toI16(
__kernel void gather_F16toF16(
__read_only image2d_t input0,
__read_only image2d_array_t input1,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int block_num,
@@ -92,7 +93,7 @@ __kernel void gather_F16toF16(
int4 coord_in = (int4)(gidy, 0, gidx, 0);
int4 indice = read_imagei(input1, coord_in.xyyy);
int4 indice = read_imagei(input1, coord_in.xy);
coord_in.w = gidz * axis_num + indice.x;
vxc_short8 src;
@@ -101,3 +102,107 @@ __kernel void gather_F16toF16(
int2 coord = (int2)(gidx, gidz * indices_num + gidy);
VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
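/* axis == 0 variants: each work-item loads an int4 of indices from input1 at
 * x = get_global_id(0), gathers one element per index from row get_global_id(1) of
 * input0, and writes the four results as consecutive outputs. */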
__kernel void gather_I8toI8_axis0(
__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int block_num,
int axis_num
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 indices = read_imagei(input1, coord.xx);
int2 coord_in = (int2)(indices.x, get_global_id(1));
vxc_char16 src, dst;
VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
indices.x = get_global_id(1);
VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniExtraCopyDpKeepinEvis_2x8);
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}
__kernel void gather_U8toU8_axis0(
__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int block_num,
int axis_num
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 indices = read_imagei(input1, coord.xx);
int2 coord_in = (int2)(indices.x, get_global_id(1));
vxc_uchar16 src, dst;
VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
indices.x = get_global_id(1);
VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniExtraCopyDpKeepinEvis_2x8);
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}
__kernel void gather_I16toI16_axis0(
__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int block_num,
int axis_num
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 indices = read_imagei(input1, coord.xx);
int2 coord_in = (int2)(indices.x, get_global_id(1));
vxc_short8 src, dst;
VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
indices.x = get_global_id(1);
VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniExtraCopyDpKeepinEvis_2x8);
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}
__kernel void gather_F16toF16_axis0(
__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int block_num,
int axis_num
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 indices = read_imagei(input1, coord.xx);
int2 coord_in = (int2)(indices.x, get_global_id(1));
vxc_short8 src, dst;
VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
indices.x = get_global_id(1);
VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniExtraCopyDpKeepinEvis_2x8);
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}

View File

@@ -11,7 +11,7 @@ _viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp
#define GATHER_8BITS_TO_F16(src0_type_name, read_type) \
__kernel void gather_##src0_type_name##toF16( \
__read_only image2d_t input0, \
__read_only image2d_array_t input1, \
__read_only image2d_t input1, \
__write_only image2d_t output, \
int block_size, \
int block_num, \
@@ -23,7 +23,7 @@ __kernel void gather_##src0_type_name##toF16( \
int gidz = get_global_id(2); \
\
int4 coord_in = (int4)(gidy, 0, gidx, 0); \
int4 indice = read_imagei(input1, coord_in.xyyy); \
int4 indice = read_imagei(input1, coord_in.xy); \
coord_in.w = gidz * axis_num + indice.x; \
\
read_type src; \
@@ -47,7 +47,7 @@ GATHER_8BITS_TO_F16(I8, vxc_char16)
#define GATHER_F16_TO_QINT(src1_type_name, write_type) \
__kernel void gather_F16to##src1_type_name( \
__read_only image2d_t input0, \
__read_only image2d_array_t input1, \
__read_only image2d_t input1, \
__write_only image2d_t output, \
int block_size, \
int block_num, \
@@ -59,7 +59,7 @@ __kernel void gather_F16to##src1_type_name( \
int gidz = get_global_id(2); \
int4 coord_in = (int4)(gidy, 0, gidx, 0); \
\
int4 indice = read_imagei(input1, coord_in.xyyy); \
int4 indice = read_imagei(input1, coord_in.xy); \
coord_in.w = gidz * axis_num + indice.x; \
\
vxc_short8 src; \
@@ -79,7 +79,7 @@ GATHER_F16_TO_QINT(I16, vxc_short8)
__kernel void gather_I16toF16(
__read_only image2d_t input0,
__read_only image2d_array_t input1,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int block_num,
@@ -91,7 +91,7 @@ __kernel void gather_I16toF16(
int gidz = get_global_id(2);
int4 coord_in = (int4)(gidy, 0, gidx, 0);
int4 indice = read_imagei(input1, coord_in.xyyy);
int4 indice = read_imagei(input1, coord_in.xy);
coord_in.w = gidz * axis_num + indice.x;
vxc_short8 src;
@@ -109,3 +109,97 @@ __kernel void gather_I16toF16(
VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
#define GATHER_8BITS_TO_F16_AXIS0(src0_type_name, read_type) \
__kernel void gather_##src0_type_name##toF16_axis0( \
__read_only image2d_t input0, \
__read_only image2d_t input1, \
__write_only image2d_t output, \
int block_size, \
int block_num, \
int axis_num \
) \
{ \
int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
int4 indices = read_imagei(input1, coord.xx); \
int2 coord_in = (int2)(indices.x, get_global_id(1)); \
\
read_type src; \
VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
indices.x = get_global_id(1); \
VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \
vxc_half8 src0; \
vxc_short8 dst0; \
vxc_ushort8 ms0; \
_viv_asm(COPY, ms0, multAndoutZP0, 16); \
VXC_DP2x8(src0,src,ms0, VXC_MODIFIER(0,7,0, VXC_RM_TowardZero,1), uniU8MulAndPostShift_0_Lo_2x8); \
_viv_asm(COPY, dst0, src0, 16); \
VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
GATHER_8BITS_TO_F16_AXIS0(U8, vxc_uchar16)
GATHER_8BITS_TO_F16_AXIS0(I8, vxc_char16)
#define GATHER_F16_TO_QINT_AXIS0(src1_type_name, write_type) \
__kernel void gather_F16to##src1_type_name##_axis0( \
__read_only image2d_t input0, \
__read_only image2d_t input1, \
__write_only image2d_t output, \
int block_size, \
int block_num, \
int axis_num \
) \
{ \
int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
int4 indices = read_imagei(input1, coord.xx); \
int2 coord_in = (int2)(indices.x, get_global_id(1)); \
\
vxc_short8 src; \
VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
indices.x = get_global_id(1); \
VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \
vxc_ushort8 mp1; \
_viv_asm(COPY, mp1, multAndoutZP1, 16); \
vxc_half8 data; \
write_type dst; \
_viv_asm(COPY, data, src, 16); \
VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven, 1),uniConvertFp16toU8_2x8); \
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
GATHER_F16_TO_QINT_AXIS0(U8, vxc_uchar16)
GATHER_F16_TO_QINT_AXIS0(I8, vxc_char16)
GATHER_F16_TO_QINT_AXIS0(I16, vxc_short8)
__kernel void gather_I16toF16_axis0(
__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int block_num,
int axis_num
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 indices = read_imagei(input1, coord.xx);
int2 coord_in = (int2)(indices.x, get_global_id(1));
vxc_short8 src;
VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
indices.x = get_global_id(1);
VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
vxc_half8 src0;
vxc_short8 dst0;
vxc_ushort8 ms0;
_viv_asm(COPY, ms0, multAndoutZP0, 16);
VXC_DP2x8(src0, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniU8MulAndPostShift_0_Lo_2x8);
_viv_asm(COPY, dst0, src0, 16);
VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}

View File

@@ -0,0 +1,279 @@
#include "cl_viv_vx_ext.h"
/**************************layernorm float16***********************************/
_viv_uniform int width;
_viv_uniform float dimRatio;
_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;
_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;
_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4;
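/* Layer norm over the innermost dimension (length `width`), fp16 in / fp16 out.
 * Pass 1 accumulates sum and sum-of-squares eight fp16 values at a time
 * (uniFp16SumSqr_dp8x2) to form mean = sum * dimRatio and
 * vari = sqr * dimRatio - mean * mean (dimRatio is expected to be 1 / width).
 * Pass 2 recomputes out = (in - mean) * rsqrt(vari + eps) * scale + bias in fp32,
 * four values per iteration, then packs the result back to fp16. */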
__kernel void layer_norm_F16toF16(
image2d_array_t input, image2d_t bias, image2d_t scale,
image2d_array_t output, float eps)
{
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);
int4 coord_out = coord;
int8 input_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.z, baseAddr_a);
vxc_short8 src0, src1;
vxc_float sum = 0, sqr = 0;
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
for(coord.x = 8; coord.x < (width+8); coord.x += 8)
{
vxc_half8 val0_h;
_viv_asm(COPY, val0_h, src0, 16);
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_float4 sumsqr;
VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\
uniFp16SumSqr_dp8x2);
sum += sumsqr.x;
sqr += sumsqr.y;
}
vxc_float mean;
mean = sum * dimRatio;
vxc_float vari;
vari = sqr*dimRatio - mean*mean;
vari += eps;
vari = rsqrt(vari);
vxc_float4 bias_f;
for(coord.x = 0; coord.x < width; coord.x += 4)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
bias_f = read_imagef(bias, coord.xw);
vxc_half8 in_h, scale_h;
_viv_asm(COPY, in_h, src0, 16);
_viv_asm(COPY, scale_h, src1, 16);
vxc_float4 in_f, scale_f;
VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
vxc_float4 sub, norm;
sub = in_f - mean;
norm = scale_f * vari * sub + bias_f;
half4 norm_h;
_viv_asm(CONV, norm_h, norm);
vxc_half8 dst;
VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniExtractHalf4_dp4x4);
vxc_short8 dstval;
_viv_asm(COPY, dstval, dst, 16);
coord_out.x = coord.x;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dstval, \
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
}
/*****************************layernorm uint8 to uint8****************************/
_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniSumU8_16x1;
_viv_uniform VXC_512Bits uniSqrSum_16x1;
_viv_uniform float input_scale;
_viv_uniform int inputZP;
_viv_uniform float outputScale;
_viv_uniform float output_zp;
_viv_uniform int sumInZp;
_viv_uniform int tmpZp1;
_viv_uniform int tmpZp2;
_viv_uniform float e2InScale;
_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
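/* Quantized variant, u8 in / u8 out. Pass 1 sums the raw u8 values and their squares in
 * integer form (uniSumU8_16x1 / uniSqrSum_16x1) and then rescales with input_scale,
 * e2InScale and the zero-point correction terms (sumInZp, tmpZp1, tmpZp2). Pass 2
 * dequantizes 16 pixels at a time, normalizes with the fp16 scale and fp32 bias, and
 * requantizes with outputScale and output_zp. */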
__kernel void layer_norm_U8toU8(
image2d_array_t input, image2d_t bias, image2d_t scale,
image2d_array_t output, float eps)
{
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);
int4 coord_out = coord;
vxc_uchar16 src0, src2;
vxc_short8 src1;
vxc_half8 scale_h;
float sum = 0, sqr = 0;
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
int tmpSum = 0, tmpSqr = 0;
vxc_int4 tmpSum1;
vxc_int4 tmpSqr1;
short zp = inputZP;
int8 input_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.z, baseAddr_a);
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
for(coord.x = 0; coord.x < width; coord.x += 16)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);
tmpSum += (tmpSum1.x);
VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);
tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);
}
sum = (tmpSum + sumInZp) * input_scale;
sqr = (tmpSqr + tmpZp2) * e2InScale;
float mean, vari;
mean = sum * dimRatio;
vari = sqr*dimRatio - mean*mean;
vari += eps;
vari = rsqrt(vari);
vxc_int4 tmpVal0, tmpVal1;
vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;
int2 coord_bias = (int2)(0, 0);
for(coord.x = 0; coord.x < width; coord.x += 16)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_bias.x = coord.x;
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
bias_f1 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert1stUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert2ndUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert3rdUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert4thUint8SubZpToFp32_4x4);
tmpData0 *= input_scale;
tmpData1 *= input_scale;
tmpData2 *= input_scale;
tmpData3 *= input_scale;
vxc_float4 norm;
tmpData0 -= mean;
norm = scale_f0 * vari * tmpData0 + bias_f0;
bias_f0 = read_imagef(bias, coord_bias);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
coord_bias.x += 4;
tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);
tmpData1 -= mean;
norm = scale_f1 * vari * tmpData1 + bias_f1;
bias_f1 = read_imagef(bias, coord_bias);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);
VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
tmpData2 -= mean;
norm = scale_f0 * vari * tmpData2 + bias_f0;
tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);
tmpData3 -= mean;
norm = scale_f1 * vari * tmpData3 + bias_f1;
tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);
VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
coord_out.x = coord.x;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src2, \
VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
}
}
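
For reference, the integer accumulation above folds the uint8 zero point into the running sums so that the mean and variance come out in dequantized units. A minimal scalar sketch of the same statistics pass is given below; it assumes the host-side uniforms are defined as sumInZp = -N*zp, tmpZp1 = -2*zp, tmpZp2 = N*zp*zp, e2InScale = input_scale*input_scale and dimRatio = 1/N, which is an inference since those values are computed outside this diff.

#include <math.h>

/* Hypothetical scalar reference for the statistics pass of layer_norm_U8toU8.
 * q points at the N uint8 codes of one normalized row; zp and input_scale are
 * the input quantization parameters. All host-side constants are recomputed
 * here under the assumptions stated above. */
static void u8_layernorm_stats(const unsigned char *q, int N, int zp,
                               float input_scale, float eps,
                               float *mean_out, float *inv_std_out)
{
    int tmpSum = 0, tmpSqr = 0;
    for (int i = 0; i < N; i++) {
        tmpSum += q[i];            /* role of uniSumU8_16x1 */
        tmpSqr += q[i] * q[i];     /* role of uniSqrSum_16x1 */
    }
    int   sumInZp   = -N * zp;                       /* assumed definition */
    int   tmpZp1    = -2 * zp;                       /* assumed definition */
    int   tmpZp2    = N * zp * zp;                   /* assumed definition */
    float e2InScale = input_scale * input_scale;     /* assumed definition */
    float dimRatio  = 1.0f / (float)N;               /* assumed definition */
    /* sum((q - zp) * s) and sum(((q - zp) * s)^2), folded exactly as in the kernel. */
    float sum = (tmpSum + sumInZp) * input_scale;
    float sqr = (tmpSqr + tmpZp1 * tmpSum + tmpZp2) * e2InScale;
    float mean = sum * dimRatio;
    float vari = sqr * dimRatio - mean * mean + eps;
    *mean_out    = mean;
    *inv_std_out = 1.0f / sqrtf(vari);               /* rsqrt(vari) in the kernel */
}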
/***************************layernorm float16 to uint8**************************/
__kernel void layer_norm_F16toU8(
image2d_array_t input, image2d_t bias, image2d_t scale,
image2d_array_t output, float eps)
{
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);
int4 coord_out = coord;
int8 input_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.z, baseAddr_a);
vxc_short8 src0, src1;
vxc_float sum = 0, sqr = 0;
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
for(coord.x = 8; coord.x < (width+8); coord.x += 8)
{
vxc_half8 val0_h;
_viv_asm(COPY, val0_h, src0, 16);
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_float4 sumsqr;
VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\
uniFp16SumSqr_dp8x2);
sum += sumsqr.x;
sqr += sumsqr.y;
}
vxc_float mean;
mean = sum * dimRatio;
vxc_float vari;
vari = sqr*dimRatio - mean*mean;
vari += eps;
vari = rsqrt(vari);
vxc_float4 bias_f;
for(coord.x = 0; coord.x < width; coord.x += 4)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
bias_f = read_imagef(bias, coord.xw);
vxc_half8 in_h, scale_h;
_viv_asm(COPY, in_h, src0, 16);
_viv_asm(COPY, scale_h, src1, 16);
vxc_float4 in_f, scale_f;
VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
vxc_float4 sub, norm;
sub = in_f - mean;
norm = scale_f * vari * sub + bias_f;
norm = norm * outputScale + output_zp;
int4 output_int4;
output_int4 = convert_int4_rte(norm);
vxc_uchar8 dst;
VXC_DP2x8(dst, output_int4, output_int4, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\
uniConvertInt32toUint8_2x8);
coord_out.x = coord.x;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, \
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
}
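
The FP16 to U8 variant ends by pushing each normalized value through the affine output quantizer, norm * outputScale + output_zp, with round-to-nearest-even. A small hedged illustration of that last step follows; it assumes outputScale is the reciprocal of the output tensor scale and that the final pack clamps to the uint8 range, neither of which is spelled out in this diff.

#include <math.h>
#include <stdint.h>

/* Illustrative requantization of one normalized value, mirroring
 * convert_int4_rte(norm * outputScale + output_zp) followed by the byte pack. */
static uint8_t requantize_u8(float norm, float outputScale, float output_zp)
{
    float v = norm * outputScale + output_zp;
    long  q = lrintf(v);      /* round to nearest, ties to even, like convert_int4_rte */
    if (q < 0)   q = 0;       /* assumed: the pack step saturates to [0, 255] */
    if (q > 255) q = 255;
    return (uint8_t)q;
}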

View File

@@ -7,12 +7,9 @@ _viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;
_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;
_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4;
__kernel void vxcLayerNorm(
image2d_array_t input,
image2d_array_t bias,
image2d_array_t scale,
image2d_array_t output,
float eps)
__kernel void layer_norm_F16toF16_2D(
image2d_t input, image2d_t bias, image2d_t scale,
image2d_t output, float eps)
{
int4 coord = (int4)(0, get_global_id(1), 0, 0);
vxc_short8 src0, src1;
@@ -44,7 +41,7 @@ __kernel void vxcLayerNorm(
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
bias_f = read_imagef(bias, coord.xwww);
bias_f = read_imagef(bias, coord.xw);
vxc_half8 in_h, scale_h;
_viv_asm(COPY, in_h, src0, 16);
_viv_asm(COPY, scale_h, src1, 16);
@@ -76,7 +73,7 @@ _viv_uniform VXC_512Bits uniSqrSum_16x1;
_viv_uniform float input_scale;
_viv_uniform int inputZP;
_viv_uniform float outputScale;
_viv_uniform int output_ZP;
_viv_uniform float output_zp;
_viv_uniform int sumInZp;
_viv_uniform int tmpZp1;
_viv_uniform int tmpZp2;
@@ -84,12 +81,9 @@ _viv_uniform float e2InScale;
_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
__kernel void vxcLayerNorm_u8(
image2d_array_t input,
image2d_array_t bias,
image2d_array_t scale,
image2d_array_t output,
float eps)
__kernel void layer_norm_U8toU8_2D(
image2d_t input, image2d_t bias, image2d_t scale,
image2d_t output, float eps)
{
int4 coord = (int4)(0, get_global_id(1), 0, 0);
vxc_uchar16 src0, src2;
@@ -121,15 +115,15 @@ __kernel void vxcLayerNorm_u8(
vari = rsqrt(vari);
vxc_int4 tmpVal0, tmpVal1;
vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;
int4 coord_bias = (int4)(0, 0, 0, 0);
int2 coord_bias = (int2)(0, 0);
for(coord.x = 0; coord.x < width; coord.x += 16)
{
coord_bias.x = coord.x;
VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_bias.x = coord.x;
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
@@ -151,49 +145,41 @@ __kernel void vxcLayerNorm_u8(
uniConvert3rdUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert4thUint8SubZpToFp32_4x4);
tmpData0 *= input_scale;
tmpData1 *= input_scale;
tmpData2 *= input_scale;
tmpData3 *= input_scale;
tmpData0 = tmpData0 * input_scale - mean;
tmpData1 = tmpData1 * input_scale - mean;
tmpData2 = tmpData2 * input_scale - mean;
tmpData3 = tmpData3 * input_scale - mean;
vxc_float4 norm;
tmpData0 -= mean;
norm = scale_f0 * vari * tmpData0 + bias_f0;
bias_f0 = read_imagef(bias, coord_bias);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
coord_bias.x += 4;
tmpVal0 = convert_int4_rte(norm * outputScale + output_ZP);
tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);
tmpData1 -= mean;
norm = scale_f1 * vari * tmpData1 + bias_f1;
bias_f1 = read_imagef(bias, coord_bias);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
tmpVal1 = convert_int4_rte(norm * outputScale + output_ZP);
tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);
VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
tmpData2 -= mean;
norm = scale_f0 * vari * tmpData2 + bias_f0;
tmpVal0 = convert_int4_rte(norm * outputScale + output_ZP);
tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);
tmpData3 -= mean;
norm = scale_f1 * vari * tmpData3 + bias_f1;
tmpVal1 = convert_int4_rte(norm * outputScale + output_ZP);
tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);
VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
}
}
/***************************layernorm float16 to uint8**************************/
_viv_uniform float outputZP;
__kernel void vxcLayerNormFP16toU8(
image2d_array_t input,
image2d_array_t bias,
image2d_array_t scale,
image2d_array_t output,
float eps)
__kernel void layer_norm_F16toU8_2D(
image2d_t input, image2d_t bias, image2d_t scale,
image2d_t output, float eps)
{
int4 coord = (int4)(0, get_global_id(1), 0, 0);
vxc_short8 src0, src1;
@@ -225,7 +211,7 @@ __kernel void vxcLayerNormFP16toU8(
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
bias_f = read_imagef(bias, coord.xwww);
bias_f = read_imagef(bias, coord.xw);
vxc_half8 in_h, scale_h;
_viv_asm(COPY, in_h, src0, 16);
_viv_asm(COPY, scale_h, src1, 16);
@@ -237,7 +223,7 @@ __kernel void vxcLayerNormFP16toU8(
vxc_float4 sub, norm;
sub = in_f - mean;
norm = scale_f * vari * sub + bias_f;
norm = norm * outputScale + outputZP;
norm = norm * outputScale + output_zp;
int4 output_int4;
output_int4 = convert_int4_rte(norm);
vxc_uchar8 dst;
@@ -245,4 +231,4 @@ __kernel void vxcLayerNormFP16toU8(
uniConvertInt32toUint8_2x8);
VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}
}
}

View File

@@ -0,0 +1,167 @@
#include "cl_viv_vx_ext.h"
/**************************layernorm int16 to int16*******************************/
_viv_uniform int width;
_viv_uniform float dimRatio;
_viv_uniform float dimRatio_scale;
_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2;
_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;
_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;
_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
_viv_uniform float e2InScale;
_viv_uniform float outputScale;
_viv_uniform float output_zp;
_viv_uniform float input_scale;
_viv_uniform int inputZP;
__kernel void layer_norm_I16toI16(
image2d_array_t input, image2d_t bias, image2d_t scale,
image2d_array_t output, float eps)
{
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);
int8 input_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.z, baseAddr_a);
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord.w, baseAddr);
vxc_short8 src0, src1, dst;
vxc_float sum = 0, sqr = 0;
for(; coord.x < width;)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.x += 8;
vxc_float4 sumsqr;
VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\
uniInt16SumSqr_dp8x2);
sum += sumsqr.x;
sqr = sqr + sumsqr.y * e2InScale;
}
vxc_float mean;
mean = sum * dimRatio_scale;
vxc_float vari;
vari = sqr*dimRatio - mean*mean;
vari += eps;
vari = rsqrt(vari);
short zp = inputZP;
vxc_float4 tmpData0, tmpData1;
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
vxc_half8 scale_h;
vxc_int4 tmpVal0, tmpVal1;
int2 coord_bias = (int2)(0, 0);
for(coord.x = 0; coord.x < width; coord.x += 8)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_bias.x = coord.x;
VXC_ReadImage(src1, scale, coord_bias, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
bias_f1 = read_imagef(bias, coord_bias);
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert1stUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert2ndUint8SubZpToFp32_4x4);
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
vxc_float4 sub, norm;
sub = tmpData0 * input_scale - mean;
norm = scale_f0 * vari * sub + bias_f0;
tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);
sub = tmpData1 * input_scale - mean;
norm = scale_f1 * vari * sub + bias_f1;
tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
}
__kernel void layer_norm_I16toI16_2D(
image2d_t input, image2d_t bias, image2d_t scale,
image2d_t output, float eps)
{
int2 coord = (int2)(0, get_global_id(1));
vxc_short8 src0, src1, dst;
vxc_float sum = 0, sqr = 0;
for(; coord.x < width;)
{
VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.x += 8;
vxc_float4 sumsqr;
VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\
uniInt16SumSqr_dp8x2);
sum += sumsqr.x;
sqr = sqr + sumsqr.y * e2InScale;
}
vxc_float mean, vari;
mean = sum * dimRatio_scale;
vari = sqr * dimRatio - mean * mean;
vari += eps;
vari = rsqrt(vari);
short zp = inputZP;
vxc_float4 tmpData0, tmpData1;
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
vxc_half8 scale_h;
vxc_int4 tmpVal0, tmpVal1;
int2 coord_bias = (int2)(0, 0);
for(coord.x = 0; coord.x < width; coord.x += 8)
{
VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_bias.x = coord.x;
VXC_ReadImage(src1, scale, coord_bias, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
bias_f1 = read_imagef(bias, coord_bias);
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert1stUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert2ndUint8SubZpToFp32_4x4);
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
vxc_float4 sub, norm;
sub = tmpData0 * input_scale - mean;
norm = scale_f0 * vari * sub + bias_f0;
tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);
sub = tmpData1 * input_scale - mean;
norm = scale_f1 * vari * sub + bias_f1;
tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
}
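
Unlike the uint8 kernels, the int16 path rescales the squared term inside the accumulation loop (sumsqr.y * e2InScale) and applies dimRatio_scale to the raw sum, so the first loop never subtracts a zero point. A scalar sketch of the equivalent computation is below; it assumes dimRatio = 1/N, dimRatio_scale = input_scale / N, e2InScale = input_scale * input_scale and a zero input zero-point for the statistics, which matches how the loop uses the raw int16 codes but is not stated explicitly in this diff.

#include <math.h>

/* Hypothetical scalar equivalent of the layer_norm_I16toI16 statistics pass. */
static void i16_layernorm_stats(const short *q, int N, float input_scale, float eps,
                                float *mean_out, float *inv_std_out)
{
    float e2InScale = input_scale * input_scale;         /* assumed definition */
    float sum = 0.0f, sqr = 0.0f;
    for (int i = 0; i < N; i++) {
        sum += (float)q[i];                              /* raw int16 codes           */
        sqr += (float)q[i] * (float)q[i] * e2InScale;    /* squares, already rescaled */
    }
    float dimRatio       = 1.0f / (float)N;              /* assumed definition */
    float dimRatio_scale = input_scale / (float)N;       /* assumed definition */
    float mean = sum * dimRatio_scale;                   /* mean in float units */
    float vari = sqr * dimRatio - mean * mean + eps;
    *mean_out    = mean;
    *inv_std_out = 1.0f / sqrtf(vari);
}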

View File

@@ -0,0 +1,252 @@
#include "cl_viv_vx_ext.h"
/*****************************layernorm uint8 to fp16****************************/
_viv_uniform int width;
_viv_uniform float dimRatio;
_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;
_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniSumU8_16x1;
_viv_uniform VXC_512Bits uniSqrSum_16x1;
_viv_uniform float input_scale;
_viv_uniform int inputZP;
_viv_uniform int sumInZp;
_viv_uniform int tmpZp1;
_viv_uniform int tmpZp2;
_viv_uniform float e2InScale;
_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;
_viv_uniform VXC_512Bits UniPackFP16even_2x8;
__kernel void layer_norm_U8toF16(
image2d_array_t input,
image2d_t bias,
image2d_t scale,
image2d_array_t output,
float eps)
{
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);
int4 coord_out = coord;
vxc_uchar16 src0;
float sum = 0, sqr = 0;
int tmpSum = 0, tmpSqr = 0;
vxc_int4 tmpSum1;
vxc_int4 tmpSqr1;
int8 input_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.z, baseAddr_a);
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
for(coord.x = 0; coord.x < width; coord.x += 16)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);
tmpSum += (tmpSum1.x);
VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);
tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);
}
sum = (tmpSum + sumInZp) * input_scale;
sqr = (tmpSqr + tmpZp2) * e2InScale;
float mean, vari;
mean = sum * dimRatio;
vari = sqr*dimRatio - mean*mean;
vari += eps;
vari = rsqrt(vari);
vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;
int2 coord_bias = (int2)(0, 0);
vxc_half8 scale_h;
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
vxc_short8 src1, outval;
short zp = inputZP;
half4 tmpVal0, tmpVal1;
vxc_half8 dst;
for(coord.x = 0; coord.x < width; coord.x += 16)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_bias.x = coord.x;
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
bias_f1 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert1stUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert2ndUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert3rdUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert4thUint8SubZpToFp32_4x4);
tmpData0 *= input_scale;
tmpData1 *= input_scale;
tmpData2 *= input_scale;
tmpData3 *= input_scale;
vxc_float4 norm;
tmpData0 -= mean;
norm = scale_f0 * vari * tmpData0 + bias_f0;
bias_f0 = read_imagef(bias, coord_bias);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
coord_bias.x += 4;
_viv_asm(CONV, tmpVal0, norm);
tmpData1 -= mean;
norm = scale_f1 * vari * tmpData1 + bias_f1;
bias_f1 = read_imagef(bias, coord_bias);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
_viv_asm(CONV, tmpVal1, norm);
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
UniPackFP16even_2x8);
_viv_asm(COPY, outval, dst, 16);
coord_out.x = coord.x;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, outval, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
tmpData2 -= mean;
norm = scale_f0 * vari * tmpData2 + bias_f0;
_viv_asm(CONV, tmpVal0, norm);
tmpData3 -= mean;
norm = scale_f1 * vari * tmpData3 + bias_f1;
_viv_asm(CONV, tmpVal1, norm);
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
UniPackFP16even_2x8);
_viv_asm(COPY, outval, dst, 16);
coord_out.x += 8;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, outval, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
}
__kernel void layer_norm_U8toF16_2D(
image2d_t input,
image2d_t bias,
image2d_t scale,
image2d_t output,
float eps)
{
int4 coord = (int4)(0, get_global_id(1), 0, 0);
vxc_uchar16 src0;
float sum = 0, sqr = 0;
int tmpSum = 0, tmpSqr = 0;
vxc_int4 tmpSum1;
vxc_int4 tmpSqr1;
for(coord.x = 0; coord.x < width; coord.x += 16)
{
VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);
tmpSum += (tmpSum1.x);
VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);
tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);
}
sum = (tmpSum + sumInZp) * input_scale;
sqr = (tmpSqr + tmpZp2) * e2InScale;
float mean, vari;
mean = sum * dimRatio;
vari = sqr*dimRatio - mean*mean;
vari += eps;
vari = rsqrt(vari);
vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;
int2 coord_bias = (int2)(0, 0);
vxc_half8 scale_h;
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
vxc_short8 src1, outval;
short zp = inputZP;
half4 tmpVal0, tmpVal1;
vxc_half8 dst;
int2 coord_out = (int2)(get_global_id(0), get_global_id(1));
for(coord.x = 0; coord.x < width; coord.x += 16)
{
coord_bias.x = coord.x;
VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
bias_f1 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert1stUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert2ndUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert3rdUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert4thUint8SubZpToFp32_4x4);
tmpData0 *= input_scale;
tmpData1 *= input_scale;
tmpData2 *= input_scale;
tmpData3 *= input_scale;
vxc_float4 norm;
tmpData0 -= mean;
norm = scale_f0 * vari * tmpData0 + bias_f0;
bias_f0 = read_imagef(bias, coord_bias);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
coord_bias.x += 4;
_viv_asm(CONV, tmpVal0, norm);
tmpData1 -= mean;
norm = scale_f1 * vari * tmpData1 + bias_f1;
bias_f1 = read_imagef(bias, coord_bias);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
_viv_asm(CONV, tmpVal1, norm);
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
UniPackFP16even_2x8);
_viv_asm(COPY, outval, dst, 16);
coord_out.x = coord.x;
VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
tmpData2 -= mean;
norm = scale_f0 * vari * tmpData2 + bias_f0;
_viv_asm(CONV, tmpVal0, norm);
tmpData3 -= mean;
norm = scale_f1 * vari * tmpData3 + bias_f1;
_viv_asm(CONV, tmpVal1, norm);
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
UniPackFP16even_2x8);
_viv_asm(COPY, outval, dst, 16);
coord_out.x += 8;
VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
}

View File

@@ -0,0 +1,426 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;
_viv_uniform int width;
_viv_uniform int height;
_viv_uniform int height_depth;
_viv_uniform float dimRatio;
_viv_uniform int group_num;
_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;
_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;
_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
_viv_uniform float outputScale;
_viv_uniform float output_zp;
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_F16toF32(
image2d_array_t input, image2d_t output)
{
int gidx = get_global_id(0) << 3;
int lidx = get_local_id(0);
int gidz = get_global_id(1);
int4 coord = (int4)(gidx, 0, gidz, 0);
vxc_short8 src0;
vxc_half8 in_h;
vxc_float4 sumsqr;
vxc_float4 tmpSumSqr = (vxc_float4)(0);
__local float lcl_sum[16];
__local float lcl_sqr[16];
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.w, baseAddr_a);
if(gidx < width)
{
for(coord.y = 0; coord.y < height;)
{
VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.y++;
_viv_asm(COPY, in_h, src0, 16);
VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\
uniFp16SumSqr_dp8x2);
tmpSumSqr += sumsqr;
}
}
lcl_sum[lidx] = tmpSumSqr.x;
lcl_sqr[lidx] = tmpSumSqr.y;
barrier(CLK_LOCAL_MEM_FENCE);
int2 coord_out = (int2)(get_group_id(0) << 2, gidz);
if(lidx == 0)
{
float4 one = (float4)(1, 1, 1, 1);
__local float4* tmp_sum = (__local float4*)lcl_sum;
__local float4* tmp_sqr = (__local float4*)lcl_sqr;
float sum = 0;
float sqr = 0;
for(int i = 0; i < 4; i++)
{
sum += dot(tmp_sum[i], one);
sqr += dot(tmp_sqr[i], one);
}
float4 data = (float4)(sum, sqr, 0, 0);
write_imagef(output, coord_out, data);
}
}
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_F16toF32_2D(
image2d_array_t input, image2d_t output)
{
int gidx = get_global_id(0) << 3;
int lidx = get_local_id(0);
int gidz = get_global_id(1);
int gidy = gidz * height;
int2 coord = (int2)(gidx, gidy);
vxc_short8 src0;
vxc_half8 in_h;
vxc_float4 sumsqr;
vxc_float4 tmpSumSqr = (vxc_float4)(0);
__local float lcl_sum[16];
__local float lcl_sqr[16];
int endH = gidy + height;
if(gidx < width)
{
for(; coord.y < endH;)
{
VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.y++;
_viv_asm(COPY, in_h, src0, 16);
VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\
uniFp16SumSqr_dp8x2);
tmpSumSqr += sumsqr;
}
}
lcl_sum[lidx] = tmpSumSqr.x;
lcl_sqr[lidx] = tmpSumSqr.y;
barrier(CLK_LOCAL_MEM_FENCE);
int2 coord_out = (int2)(get_group_id(0) << 2, gidz);
if(lidx == 0)
{
float4 one = (float4)(1, 1, 1, 1);
__local float4* tmp_sum = (__local float4*)lcl_sum;
__local float4* tmp_sqr = (__local float4*)lcl_sqr;
float sum = 0;
float sqr = 0;
for(int i = 0; i < 4; i++)
{
sum += dot(tmp_sum[i], one);
sqr += dot(tmp_sqr[i], one);
}
float4 data = (float4)(sum, sqr, 0, 0);
write_imagef(output, coord_out, data);
}
}
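
The two sum/sum-of-squares kernels above are the first half of a two-stage reduction over the whole width x height plane: every 16-lane workgroup accumulates partial (sum, sumsq) pairs in local memory, lane 0 folds them with four float4 dot products, and one (sum, sqr, 0, 0) entry per group is written to the meanVari image. The normalization kernels that follow combine group_num such entries. A hedged scalar sketch of that second stage, assuming dimRatio is the reciprocal of the number of normalized elements (width * height here):

#include <math.h>

/* Illustrative second-stage combine of the per-group partial sums, mirroring the
 * mean_vari loop at the top of layernorm_wh_F16toF16 and the related kernels. */
static void combine_group_sums(const float *group_sum, const float *group_sqr,
                               int group_num, float dimRatio, float eps,
                               float *mean_out, float *inv_std_out)
{
    float sum = 0.0f, sqr = 0.0f;
    for (int i = 0; i < group_num; i++) {   /* one entry per 16-lane workgroup */
        sum += group_sum[i];
        sqr += group_sqr[i];
    }
    float mean = sum * dimRatio;            /* mean_vari.s0 in the kernel       */
    float vari = sqr * dimRatio - mean * mean + eps;
    *mean_out    = mean;
    *inv_std_out = 1.0f / sqrtf(vari);      /* mean_vari.s1 after rsqrt         */
}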
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toF16(
image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,
image2d_array_t output, float eps)
{
int gidz = get_global_id(1);
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
int2 coord_sum = (int2)(0, gidz);
int4 coord_para = coord;
coord_para.z = (ushort)gidz / (ushort)(height_depth);
vxc_short8 src0;
vxc_short8 src1;
vxc_half8 scale_h, in_h;
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
vxc_float4 mean_vari = (vxc_float4)(0);
for(int i = 0; i < group_num; i++)
{
mean_vari += read_imagef(meanVari, coord_sum);
coord_sum.x += 4;
}
mean_vari *= dimRatio;
mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;
mean_vari.s1 = rsqrt(mean_vari.s1);
int4 coord_bias = coord_para;
int8 input_desc, scale_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.w, baseAddr_a);
_viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));
int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;
_viv_asm(MOV, coord_para.w, baseAddr_c);
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord.z, baseAddr);
vxc_float4 tmpData0, tmpData1;
vxc_short8 outval;
half4 tmpVal0, tmpVal1;
vxc_half8 dst;
for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_para.y = coord.y;
coord_bias.y = coord.y;
VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
bias_f1 = read_imagef(bias, coord_bias);
coord_bias.x = coord.x;
_viv_asm(COPY, in_h, src0, 16);
VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
vxc_float4 sub, norm;
sub = tmpData0 - mean_vari.s0;
norm = scale_f0 * mean_vari.s1 * sub + bias_f0;
_viv_asm(CONV, tmpVal0, norm);
sub = tmpData1 - mean_vari.s0;
norm = scale_f1 * mean_vari.s1 * sub + bias_f1;
_viv_asm(CONV, tmpVal1, norm);
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
uniConvertHalfToFp16_2x8);
_viv_asm(COPY, outval, dst, 16);
VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
}
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toF16_2D(
image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,
image2d_t output, float eps)
{
int2 coord = (int2)(get_global_id(0), 0);
int2 coord_bias = (int2)(0, 0);
vxc_short8 src0;
vxc_short8 src1;
vxc_half8 scale_h, in_h;
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
vxc_float4 mean_vari = (vxc_float4)(0);
for(int i = 0; i < group_num; i++)
{
mean_vari += read_imagef(meanVari, coord_bias);
coord_bias.x += 4;
}
mean_vari *= dimRatio;
mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;
mean_vari.s1 = rsqrt(mean_vari.s1);
coord_bias = coord;
vxc_float4 tmpData0, tmpData1;
vxc_short8 outval;
half4 tmpVal0, tmpVal1;
vxc_half8 dst;
for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_bias.y = coord.y;
VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
bias_f1 = read_imagef(bias, coord_bias);
coord_bias.x = coord.x;
_viv_asm(COPY, in_h, src0, 16);
VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
vxc_float4 sub, norm;
sub = tmpData0 - mean_vari.s0;
norm = scale_f0 * mean_vari.s1 * sub + bias_f0;
_viv_asm(CONV, tmpVal0, norm);
sub = tmpData1 - mean_vari.s0;
norm = scale_f1 * mean_vari.s1 * sub + bias_f1;
_viv_asm(CONV, tmpVal1, norm);
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
uniConvertHalfToFp16_2x8);
_viv_asm(COPY, outval, dst, 16);
VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
}
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toU8(
image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,
image2d_array_t output, float eps)
{
int gidz = get_global_id(1);
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
int2 coord_sum = (int2)(0, gidz);
int4 coord_para = coord;
coord_para.z = (ushort)gidz / (ushort)(height_depth);
vxc_short8 src0;
vxc_short8 src1;
vxc_half8 scale_h, in_h;
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
vxc_float4 mean_vari = (vxc_float4)(0);
for(int i = 0; i < group_num; i++)
{
mean_vari += read_imagef(meanVari, coord_sum);
coord_sum.x += 4;
}
mean_vari *= dimRatio;
mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;
mean_vari.s1 = rsqrt(mean_vari.s1);
int4 coord_bias = coord_para;
int8 input_desc, scale_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.w, baseAddr_a);
_viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));
int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;
_viv_asm(MOV, coord_para.w, baseAddr_c);
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord.z, baseAddr);
vxc_float4 tmpData0, tmpData1;
vxc_uchar16 outval;
vxc_int4 tmpVal0, tmpVal1;
for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_para.y = coord.y;
coord_bias.y = coord.y;
VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
bias_f1 = read_imagef(bias, coord_bias);
coord_bias.x = coord.x;
_viv_asm(COPY, in_h, src0, 16);
VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
vxc_float4 sub, norm;
sub = tmpData0 - mean_vari.s0;
norm = scale_f0 * mean_vari.s1 * sub + bias_f0;
tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);
sub = tmpData1 - mean_vari.s0;
norm = scale_f1 * mean_vari.s1 * sub + bias_f1;
tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);
VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
}
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toU8_2D(
image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,
image2d_t output, float eps)
{
int2 coord = (int2)(get_global_id(0), 0);
int2 coord_bias = (int2)(0, 0);
vxc_short8 src0;
vxc_short8 src1;
vxc_half8 scale_h, in_h;
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
vxc_float4 mean_vari = (vxc_float4)(0);
for(int i = 0; i < group_num; i++)
{
mean_vari += read_imagef(meanVari, coord_bias);
coord_bias.x += 4;
}
mean_vari *= dimRatio;
mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;
mean_vari.s1 = rsqrt(mean_vari.s1);
coord_bias = coord;
vxc_float4 tmpData0, tmpData1;
vxc_uchar16 outval;
vxc_int4 tmpVal0, tmpVal1;
for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_bias.y = coord.y;
VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
bias_f1 = read_imagef(bias, coord_bias);
coord_bias.x = coord.x;
_viv_asm(COPY, in_h, src0, 16);
VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
vxc_float4 sub, norm;
sub = tmpData0 - mean_vari.s0;
norm = scale_f0 * mean_vari.s1 * sub + bias_f0;
tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);
sub = tmpData1 - mean_vari.s0;
norm = scale_f1 * mean_vari.s1 * sub + bias_f1;
tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);
VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
}

View File

@@ -0,0 +1,266 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2;
_viv_uniform float e2InScale;
_viv_uniform int width;
_viv_uniform float input_scale;
_viv_uniform int height;
_viv_uniform int height_depth;
_viv_uniform float dimRatio;
_viv_uniform int group_num;
_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;
_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;
_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
_viv_uniform float outputScale;
_viv_uniform float output_zp;
_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;
_viv_uniform int inputZP;
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_I16toF32(
image2d_array_t input, image2d_t output)
{
int gidx = get_global_id(0) << 4;
int lidx = get_local_id(0);
int gidz = get_global_id(1);
int4 coord = (int4)(gidx, 0, gidz, 0);
vxc_short8 src0;
float4 tmpSumSqr = (float4)(0);
__local float lcl_sum[16];
__local float lcl_sqr[16];
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.w, baseAddr_a);
if(gidx < width)
{
for(coord.y = 0; coord.y < height;)
{
VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.y++;
vxc_float4 sumsqr;
VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\
uniInt16SumSqr_dp8x2);
tmpSumSqr += sumsqr;
}
tmpSumSqr.x *= input_scale;
tmpSumSqr.y *= e2InScale;
}
lcl_sum[lidx] = tmpSumSqr.x;
lcl_sqr[lidx] = tmpSumSqr.y;
barrier(CLK_LOCAL_MEM_FENCE);
int2 coord_out = (int2)(get_group_id(0) << 2, gidz);
if(lidx == 0)
{
float4 one = (float4)(1, 1, 1, 1);
__local float4* tmp_sum = (__local float4*)lcl_sum;
__local float4* tmp_sqr = (__local float4*)lcl_sqr;
float4 data = (float4)(0);
for(int i = 0; i < 4; i++)
{
data.x += dot(tmp_sum[i], one);
data.y += dot(tmp_sqr[i], one);
}
write_imagef(output, coord_out, data);
}
}
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_I16toF32_2D(
image2d_t input, image2d_t output)
{
int gidx = get_global_id(0) << 4;
int lidx = get_local_id(0);
int gidz = get_global_id(1);
int gidy = gidz * height;
int2 coord = (int2)(gidx, gidy);
vxc_short8 src0;
float4 tmpSumSqr = (float4)(0);
__local float lcl_sum[16];
__local float lcl_sqr[16];
int endH = gidy + height;
if(gidx < width)
{
for(; coord.y < endH;)
{
VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.y++;
vxc_float4 sumsqr;
VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\
uniInt16SumSqr_dp8x2);
tmpSumSqr += sumsqr;
}
tmpSumSqr.x *= input_scale;
tmpSumSqr.y *= e2InScale;
}
lcl_sum[lidx] = tmpSumSqr.x;
lcl_sqr[lidx] = tmpSumSqr.y;
barrier(CLK_LOCAL_MEM_FENCE);
int2 coord_out = (int2)(get_group_id(0) << 2, gidz);
if(lidx == 0)
{
float4 one = (float4)(1, 1, 1, 1);
__local float4* tmp_sum = (__local float4*)lcl_sum;
__local float4* tmp_sqr = (__local float4*)lcl_sqr;
float4 data = (float4)(0);
for(int i = 0; i < 4; i++)
{
data.x += dot(tmp_sum[i], one);
data.y += dot(tmp_sqr[i], one);
}
write_imagef(output, coord_out, data);
}
}
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16toI16(
image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,
image2d_array_t output, float eps)
{
int gidz = get_global_id(1);
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
int2 coord_sum = (int2)(0, gidz);
int4 coord_para = coord;
coord_para.z = (ushort)gidz / (ushort)(height_depth);
vxc_short8 src0, src1, outval;
vxc_half8 scale_h;
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
vxc_float4 mean_vari = (vxc_float4)(0);
for(int i = 0; i < group_num; i++)
{
mean_vari += read_imagef(meanVari, coord_sum);
coord_sum.x += 4;
}
mean_vari *= dimRatio;
mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;
mean_vari.s1 = rsqrt(mean_vari.s1);
int4 coord_bias = coord_para;
int8 input_desc, scale_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.w, baseAddr_a);
_viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));
int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;
_viv_asm(MOV, coord_para.w, baseAddr_c);
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord.z, baseAddr);
short zp = inputZP;
vxc_float4 tmpData0, tmpData1, norm;
vxc_int4 tmpVal0, tmpVal1;
for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_para.y = coord.y;
coord_bias.y = coord.y;
VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
bias_f1 = read_imagef(bias, coord_bias);
coord_bias.x = coord.x;
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert1stUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert2ndUint8SubZpToFp32_4x4);
tmpData0 = tmpData0 * input_scale - mean_vari.s0;
tmpData1 = tmpData1 * input_scale - mean_vari.s0;
norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;
tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);
norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;
tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);
VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
}
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16toI16_2D(
image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,
image2d_t output, float eps)
{
int2 coord = (int2)(get_global_id(0), 0);
int2 coord_bias = (int2)(0, 0);
vxc_short8 src0, src1, outval;
vxc_half8 scale_h;
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
vxc_float4 mean_vari = (vxc_float4)(0);
for(int i = 0; i < group_num; i++)
{
mean_vari += read_imagef(meanVari, coord_bias);
coord_bias.x += 4;
}
mean_vari *= dimRatio;
mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;
mean_vari.s1 = rsqrt(mean_vari.s1);
coord_bias = coord;
short zp = inputZP;
vxc_float4 tmpData0, tmpData1, norm;
vxc_int4 tmpVal0, tmpVal1;
for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_bias.y = coord.y;
VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
bias_f1 = read_imagef(bias, coord_bias);
coord_bias.x = coord.x;
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert1stUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert2ndUint8SubZpToFp32_4x4);
tmpData0 = tmpData0 * input_scale - mean_vari.s0;
tmpData1 = tmpData1 * input_scale - mean_vari.s0;
norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;
tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);
norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;
tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);
VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
}

View File

@@ -0,0 +1,419 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniSumU8_16x1;
_viv_uniform VXC_512Bits uniSqrSum_16x1;
_viv_uniform int sumInZp;
_viv_uniform int tmpZp1;
_viv_uniform float e2InScale;
_viv_uniform float rowSumScale;
_viv_uniform int width;
_viv_uniform float input_scale;
_viv_uniform int height;
_viv_uniform int height_depth;
_viv_uniform float dimRatio;
_viv_uniform int group_num;
_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;
_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;
_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
_viv_uniform float outputScale;
_viv_uniform float output_zp;
_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;
_viv_uniform int inputZP;
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_U8toF32(
image2d_array_t input, image2d_t output)
{
int gidx = get_global_id(0) << 4;
int lidx = get_local_id(0);
int gidz = get_global_id(1);
int4 coord = (int4)(gidx, 0, gidz, 0);
vxc_uchar16 src0;
float sum = 0, sqr = 0;
int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;
__local float lcl_sum[16];
__local float lcl_sqr[16];
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.w, baseAddr_a);
if(gidx < width)
{
for(coord.y = 0; coord.y < height;)
{
VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord.y++;
VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);
tmpSum += (tmpSum1);
VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);
tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1);
}
sqr += (tmpSqr * e2InScale + rowSumScale);
sum = (tmpSum + sumInZp) * input_scale;
}
lcl_sum[lidx] = sum;
lcl_sqr[lidx] = sqr;
barrier(CLK_LOCAL_MEM_FENCE);
int2 coord_out = (int2)(get_group_id(0) << 2, gidz);
if(lidx == 0)
{
float4 one = (float4)(1, 1, 1, 1);
__local float4* tmp_sum = (__local float4*)lcl_sum;
__local float4* tmp_sqr = (__local float4*)lcl_sqr;
sum = 0; sqr = 0;
for(int i = 0; i < 4; i++)
{
sum += dot(tmp_sum[i], one);
sqr += dot(tmp_sqr[i], one);
}
float4 data = (float4)(sum, sqr, 0, 0);
write_imagef(output, coord_out, data);
}
}
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_U8toF32_2D(
image2d_t input, image2d_t output)
{
int gidx = get_global_id(0) << 4;
int lidx = get_local_id(0);
int gidz = get_global_id(1);
int gidy = gidz * height;
int2 coord = (int2)(gidx, gidy);
vxc_uchar16 src0;
float sum = 0, sqr = 0;
int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;
__local float lcl_sum[16];
__local float lcl_sqr[16];
int endH = gidy + height;
if(gidx < width)
{
for(; coord.y < endH;)
{
VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord.y++;
VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);
tmpSum += (tmpSum1);
VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);
tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1);
}
sqr += (tmpSqr * e2InScale + rowSumScale);
sum = (tmpSum + sumInZp) * input_scale;
}
lcl_sum[lidx] = sum;
lcl_sqr[lidx] = sqr;
barrier(CLK_LOCAL_MEM_FENCE);
int2 coord_out = (int2)(get_group_id(0) << 2, gidz);
if(lidx == 0)
{
float4 one = (float4)(1, 1, 1, 1);
__local float4* tmp_sum = (__local float4*)lcl_sum;
__local float4* tmp_sqr = (__local float4*)lcl_sqr;
sum = 0; sqr = 0;
for(int i = 0; i < 4; i++)
{
sum += dot(tmp_sum[i], one);
sqr += dot(tmp_sqr[i], one);
}
float4 data = (float4)(sum, sqr, 0, 0);
write_imagef(output, coord_out, data);
}
}
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF16(
image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,
image2d_array_t output, float eps)
{
int gidz = get_global_id(1);
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
int2 coord_sum = (int2)(0, gidz);
int4 coord_para = coord;
coord_para.z = (ushort)gidz / (ushort)(height_depth);
vxc_uchar16 src0;
vxc_short8 src1, outval;
vxc_half8 scale_h, dst;
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
vxc_float4 mean_vari = (vxc_float4)(0);
for(int i = 0; i < group_num; i++)
{
mean_vari += read_imagef(meanVari, coord_sum);
coord_sum.x += 4;
}
mean_vari *= dimRatio;
mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;
mean_vari.s1 = rsqrt(mean_vari.s1);
int4 coord_bias = coord_para;
int8 input_desc, scale_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.w, baseAddr_a);
_viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));
int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;
_viv_asm(MOV, coord_para.w, baseAddr_c);
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord.z, baseAddr);
short zp = inputZP;
vxc_float4 tmpData0, tmpData1, norm;
half4 tmpVal0, tmpVal1;
for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_para.y = coord.y; coord_bias.y = coord.y;
VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
bias_f1 = read_imagef(bias, coord_bias);
coord_bias.x = coord.x;
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert1stUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert2ndUint8SubZpToFp32_4x4);
tmpData0 = tmpData0 * input_scale - mean_vari.s0;
tmpData1 = tmpData1 * input_scale - mean_vari.s0;
norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;
_viv_asm(CONV, tmpVal0, norm);
norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;
_viv_asm(CONV, tmpVal1, norm);
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
uniConvertHalfToFp16_2x8);
_viv_asm(COPY, outval, dst, 16);
VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
}
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF16_2D(
image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,
image2d_t output, float eps)
{
int2 coord = (int2)(get_global_id(0), 0);
int2 coord_bias = (int2)(0, 0);
vxc_uchar16 src0;
vxc_short8 src1, outval;
vxc_half8 scale_h, dst;
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
vxc_float4 mean_vari = (vxc_float4)(0);
for(int i = 0; i < group_num; i++)
{
mean_vari += read_imagef(meanVari, coord_bias);
coord_bias.x += 4;
}
mean_vari *= dimRatio;
mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;
mean_vari.s1 = rsqrt(mean_vari.s1);
coord_bias = coord;
short zp = inputZP;
vxc_float4 tmpData0, tmpData1, norm;
half4 tmpVal0, tmpVal1;
for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_bias.y = coord.y;
VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
bias_f1 = read_imagef(bias, coord_bias);
coord_bias.x = coord.x;
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert1stUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert2ndUint8SubZpToFp32_4x4);
tmpData0 = tmpData0 * input_scale - mean_vari.s0;
tmpData1 = tmpData1 * input_scale - mean_vari.s0;
norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;
_viv_asm(CONV, tmpVal0, norm);
norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;
_viv_asm(CONV, tmpVal1, norm);
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
uniConvertHalfToFp16_2x8);
_viv_asm(COPY, outval, dst, 16);
VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
}
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU8(
image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,
image2d_array_t output, float eps)
{
int gidz = get_global_id(1);
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
int2 coord_sum = (int2)(0, gidz);
int4 coord_para = coord;
coord_para.z = (ushort)gidz / (ushort)(height_depth);
vxc_uchar16 src0 , outval;
vxc_short8 src1;
vxc_half8 scale_h;
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
vxc_float4 mean_vari = (vxc_float4)(0);
for(int i = 0; i < group_num; i++)
{
mean_vari += read_imagef(meanVari, coord_sum);
coord_sum.x += 4;
}
mean_vari *= dimRatio;
mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;
mean_vari.s1 = rsqrt(mean_vari.s1);
int4 coord_bias = coord_para;
int8 input_desc, scale_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.w, baseAddr_a);
_viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));
int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;
_viv_asm(MOV, coord_para.w, baseAddr_c);
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord.z, baseAddr);
short zp = inputZP;
vxc_float4 tmpData0, tmpData1, norm;
vxc_int4 tmpVal0, tmpVal1;
for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_para.y = coord.y;
coord_bias.y = coord.y;
VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
bias_f1 = read_imagef(bias, coord_bias);
coord_bias.x = coord.x;
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert1stUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert2ndUint8SubZpToFp32_4x4);
tmpData0 = tmpData0 * input_scale - mean_vari.s0;
tmpData1 = tmpData1 * input_scale - mean_vari.s0;
norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;
tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);
norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;
tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);
VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
}
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU8_2D(
image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,
image2d_t output, float eps)
{
int2 coord = (int2)(get_global_id(0), 0);
int2 coord_bias = (int2)(0, 0);
vxc_uchar16 src0, outval;
vxc_short8 src1;
vxc_half8 scale_h;
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
vxc_float4 mean_vari = (vxc_float4)(0);
for(int i = 0; i < group_num; i++)
{
mean_vari += read_imagef(meanVari, coord_bias);
coord_bias.x += 4;
}
mean_vari *= dimRatio;
mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;
mean_vari.s1 = rsqrt(mean_vari.s1);
coord_bias = coord;
short zp = inputZP;
vxc_float4 tmpData0, tmpData1, norm;
vxc_int4 tmpVal0, tmpVal1;
for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_bias.y = coord.y;
VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
bias_f1 = read_imagef(bias, coord_bias);
coord_bias.x = coord.x;
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert1stUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert2ndUint8SubZpToFp32_4x4);
tmpData0 = tmpData0 * input_scale - mean_vari.s0;
tmpData1 = tmpData1 * input_scale - mean_vari.s0;
norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;
tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);
norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;
tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);
VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
}
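For reference, a minimal host-side C sketch of the quantized layer-norm arithmetic the layernorm_wh_* kernels above perform: dequantize the U8 input with input_scale/inputZP, normalize against the mean and E[x^2] accumulated in meanVari, then apply the per-element scale and bias. The helper name and sample values are illustrative; the U8-output variants additionally requantize with outputScale/output_zp.
#include <math.h>
#include <stdint.h>
#include <stdio.h>
/* layernorm_u8_ref is a hypothetical helper mirroring the kernel math:
 * mean_vari.s0 = E[x], mean_vari.s1 = E[x^2], variance = s1 - s0*s0 + eps,
 * out = gamma * rsqrt(variance) * (x_dequant - mean) + beta. */
static void layernorm_u8_ref(const uint8_t *x, const float *gamma, const float *beta,
                             float *out, int n, float input_scale, int input_zp, float eps)
{
    float mean = 0.0f, sqr = 0.0f;
    for (int i = 0; i < n; ++i) {
        float v = (x[i] - input_zp) * input_scale;  /* dequantize U8 input */
        mean += v;
        sqr  += v * v;
    }
    mean /= n;                                      /* mean_vari.s0 */
    float var = sqr / n - mean * mean + eps;        /* mean_vari.s1 - s0*s0 + eps */
    float inv_std = 1.0f / sqrtf(var);              /* kernel uses rsqrt() */
    for (int i = 0; i < n; ++i) {
        float v = (x[i] - input_zp) * input_scale - mean;
        out[i] = gamma[i] * inv_std * v + beta[i];  /* scale * norm + bias */
    }
}
int main(void)
{
    uint8_t x[4] = { 10, 20, 30, 40 };
    float gamma[4] = { 1, 1, 1, 1 }, beta[4] = { 0, 0, 0, 0 }, out[4];
    layernorm_u8_ref(x, gamma, beta, out, 4, 0.5f, 0, 1e-5f);
    for (int i = 0; i < 4; ++i) printf("%f\n", out[i]);
    return 0;
}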

View File

@ -1,136 +0,0 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniDescaleU8_4x4;
_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp1BgraShort_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp2BgraShort_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp3BgraShort_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp4BgraShort_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp5BgraShort_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp6BgraShort_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp7BgraShort_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp8BgraShort_4x4;
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
_viv_uniform VXC_512Bits uniExtractInt32BgraToU8Bgr_2x8;
_viv_uniform int zp;
_viv_uniform float outputScale;
__kernel void pre_process_bgra_scale_nhwc_U8toU8(
__read_only image2d_array_t input, __write_only image2d_array_t output,
global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,
float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)
{
int4 gidx = get_global_id(0);
int gidy = get_global_id(1);
gidx += (int4)(0, 1, 2, 3);
int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14);
int4 sx = fx & 0xffff8000; // Floor
int fy, sy;
fx -= sx;
sx = sx >> 15;
fx = (fx +(1 << 4)) >> 5;
// for y
fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14);
sy = fy & 0xffff8000; // Floor
fy -= sy;
sy = sy >> 15;
sy = sy < 0 ? 0 : sy;
fy = fy < 0 ? 0 : fy;
fy = (fy + (1<< 4)) >> 5;
sx = (sx + (*xOffset)) * 4 ;
sy += (*yOffset);
int4 srcPos = (int4)(sx.x, sy, sy + 1, sx.y);
vxc_uchar16 lineBGRA0, lineBGRA1, lineBGRA2, lineBGRA3;
vxc_uchar16 dataB, dataG, dataR;
VXC_ReadImage(lineBGRA0, input, srcPos.xy, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(lineBGRA0, input, srcPos.xz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(lineBGRA1, input, srcPos.wy, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(lineBGRA1, input, srcPos.wz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));
srcPos.x = sx.z;
srcPos.w = sx.w;
VXC_ReadImage(lineBGRA2, input, srcPos.xy, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(lineBGRA2, input, srcPos.xz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(lineBGRA3, input, srcPos.wy, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(lineBGRA3, input, srcPos.wz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));
vxc_uchar4 val_u8;
int4 tmp1, tmp2, result1, result2;
float4 tmpDst, tmp0;
float4 mean = (float4)(bMean, gMean, rMean, 0);
//tmpFx = (int4)(fx.x, fx.x, fx.x, fx.x);
int tmpV = 1 << 19;
vxc_short8 tmpFx;
VXC_DP2x8(tmpFx, fx, fx, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),
uniConvertInt32toUint8_2x8);
//tmpFx = fx.xxxx;
VXC_DP4x4(tmp1, lineBGRA0, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),
uniBilinearTmp1BgraShort_4x4);
VXC_DP4x4(tmp2, lineBGRA0, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),
uniBilinearTmp2BgraShort_4x4);
tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10);
VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);
VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \
uniConvertIntergetoF32_4x4);
tmpDst = (tmp0 - mean) * var;
result1 = convert_int4_rte(tmpDst * outputScale + zp);
//tmpFx = fx.yyyy;
VXC_DP4x4(tmp1, lineBGRA1, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3BgraShort_4x4);
VXC_DP4x4(tmp2, lineBGRA1, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4BgraShort_4x4);
tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10);
VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);
VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \
uniConvertIntergetoF32_4x4);
tmpDst = (tmp0 - mean) * var;
result2 = convert_int4_rte(tmpDst * outputScale + zp);
vxc_uchar16 dst;
VXC_DP2x8(dst, result1, result2, VXC_MODIFIER(0, 5, 0, VXC_RM_ToNearestEven, 1),
uniExtractInt32BgraToU8Bgr_2x8);
//tmpFx = fx.zzzz;
VXC_DP4x4(tmp1, lineBGRA2, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp5BgraShort_4x4);
VXC_DP4x4(tmp2, lineBGRA2, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp6BgraShort_4x4);
tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10);
VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);
VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \
uniConvertIntergetoF32_4x4);
tmpDst = (tmp0 - mean) * var;
result1 = convert_int4_rte(tmpDst * outputScale + zp);
//tmpFx = fx.wwww;
VXC_DP4x4(tmp1, lineBGRA3, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp7BgraShort_4x4);
VXC_DP4x4(tmp2, lineBGRA3, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp8BgraShort_4x4);
tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10);
VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);
VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \
uniConvertIntergetoF32_4x4);
tmpDst = (tmp0 - mean) * var;
result2 = convert_int4_rte(tmpDst * outputScale + zp);
VXC_DP2x8(dst, result1, result2, VXC_MODIFIER(6, 11, 0, VXC_RM_ToNearestEven, 1),
uniExtractInt32BgraToU8Bgr_2x8);
int4 dstPos = (int4)(get_global_id(0) * 3, gidy, 0, 0);
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
}
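The scale kernels above resample entirely in fixed point: the destination pixel is mapped to a Q15 source coordinate with half-pixel centering, the fraction is reduced to a Q10 weight, the two lerps are accumulated to Q20, and the 1 << 19 constant rounds the result back down. A small single-channel C sketch of that arithmetic; the ratio construction, the edge clamps and the sample values are illustrative (the kernels themselves only clamp the y coordinate).
#include <stdint.h>
#include <stdio.h>
/* Single-channel reference of the Q15/Q10 bilinear math used above.
 * x_ratio / y_ratio are assumed to be (src_size << 15) / dst_size,
 * matching the xRatio/yRatio inputs of the kernels. */
static uint8_t bilinear_q15(const uint8_t *img, int w, int h, int stride,
                            int dx, int dy, int x_ratio, int y_ratio)
{
    int fx = dx * x_ratio + (x_ratio >> 1) - (1 << 14);  /* Q15 src coord  */
    int fy = dy * y_ratio + (y_ratio >> 1) - (1 << 14);
    int sx = fx & 0xffff8000;                            /* floor (Q15)    */
    int sy = fy & 0xffff8000;
    fx -= sx;  sx >>= 15;                                /* integer column */
    fy -= sy;  sy >>= 15;                                /* integer row    */
    if (sy < 0) { sy = 0; fy = 0; }
    if (sx < 0) { sx = 0; fx = 0; }                      /* clamp added for the sketch */
    fx = (fx + (1 << 4)) >> 5;                           /* Q15 -> Q10     */
    fy = (fy + (1 << 4)) >> 5;

    int x1 = (sx + 1 < w) ? 1 : 0;                       /* clamp right edge */
    const uint8_t *p0 = img + sy * stride + sx;
    const uint8_t *p1 = p0 + ((sy + 1 < h) ? stride : 0);
    int top = (p0[0] << 10) + fx * (p0[x1] - p0[0]);     /* horizontal lerp, Q10 */
    int bot = (p1[0] << 10) + fx * (p1[x1] - p1[0]);
    int q20 = (top << 10) + fy * (bot - top);            /* vertical lerp, Q20   */
    return (uint8_t)((q20 + (1 << 19)) >> 20);           /* round and descale    */
}

int main(void)
{
    uint8_t img[2 * 2] = { 0, 100, 100, 200 };
    int ratio = (2 << 15) / 4;                           /* 2x2 -> 4x4 upscale */
    printf("%d\n", bilinear_q15(img, 2, 2, 2, 1, 1, ratio, ratio));
    return 0;
}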

View File

@ -1,89 +0,0 @@
#include "cl_viv_vx_ext.h"
_viv_uniform int bOrder;
_viv_uniform int rOrder;
_viv_uniform float outputScaleVar;
_viv_uniform float bMeanScaleVarZp;
_viv_uniform float gMeanScaleVarZp;
_viv_uniform float rMeanScaleVarZp;
_viv_uniform uint xrIntFloat_16;
_viv_uniform uint yrIntFloat_16;
_viv_uniform VXC_512Bits uniConvertNV12toB_4x4;
_viv_uniform VXC_512Bits uniConvertNV12toG_4x4;
_viv_uniform VXC_512Bits uniConvertNV12toR_4x4;
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
_viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8;
_viv_uniform VXC_512Bits uniTransPackBgr1st_2x8;
_viv_uniform VXC_512Bits uniTransPackBgr2nd_2x8;
__kernel void pre_process_nv12_trans_U8toU8(
__read_only image2d_t y_img, __read_only image2d_t uv_img,
__write_only image2d_t output,
global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,
float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)
{
uint4 gidx = get_global_id(0);
uint gidy = get_global_id(1);
gidx += (uint4)(0, 1, 2, 3);
uint dy = (gidy * yrIntFloat_16) >> 16;
uint4 dx = (gidx * xrIntFloat_16) >> 16;
int sy = convert_int(dy) + (*yOffset);
int4 sx = convert_int4(dx) + (*xOffset);
int4 uvX = sx & 0xfffffffe;
int uvY = sy >> 1;
vxc_uchar16 Y, UV;
int2 coord = (int2)(sx.x, sy);
int2 coord_uv = (int2)(uvX.x, uvY);
VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord.x = sx.y;
VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord.x = sx.z;
VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord.x = sx.w;
VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(UV, uv_img, coord_uv, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
coord_uv.x = uvX.y;
VXC_ReadImage(UV, uv_img, coord_uv, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
coord_uv.x = uvX.z;
VXC_ReadImage(UV, uv_img, coord_uv, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
coord_uv.x = uvX.w;
VXC_ReadImage(UV, uv_img, coord_uv, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
vxc_char16 tmpUV;
short tmpVal = 128;
VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8);
float4 tmpDstB, tmpDstG, tmpDstR;
VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4);
VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4);
VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4);
int4 result, dstR, dstG, dstB;
vxc_uchar16 dst, tmpPack;
dstB = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp);
dstG = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp);
dstR = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp);
if(bOrder == 2)
{
int4 exchangeData = dstB;
dstB = dstR;
dstR = exchangeData;
}
VXC_DP2x8(tmpPack, dstB, dstG, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);
VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr1st_2x8);
VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr2nd_2x8);
int2 dstPos = (int2)(get_global_id(0) * 3, gidy);
VXC_WriteImage(output, dstPos, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
}
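The NV12 trans kernel above does no interpolation: each destination pixel is mapped to its nearest source pixel through a 16.16 fixed-point ratio (xrIntFloat_16 / yrIntFloat_16), and the interleaved chroma plane is addressed at the even column and half row. A tiny C illustration of that mapping; the kernel receives the ratio as a uniform, so the (src_w << 16) / dst_w setup here is only an assumption.
#include <stdint.h>
#include <stdio.h>
int main(void)
{
    int src_w = 640, dst_w = 300;                            /* illustrative sizes */
    uint32_t xr_q16 = ((uint32_t)src_w << 16) / dst_w;       /* plausible xrIntFloat_16 */
    for (int dst_x = 0; dst_x < 5; ++dst_x) {
        uint32_t src_x = ((uint32_t)dst_x * xr_q16) >> 16;   /* dx = (gidx * xrIntFloat_16) >> 16 */
        uint32_t uv_x  = src_x & ~1u;                        /* NV12 chroma: even column */
        printf("dst %d -> luma col %u, chroma col %u\n", dst_x, src_x, uv_x);
    }
    return 0;
}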

View File

@ -1,94 +0,0 @@
#include "cl_viv_vx_ext.h"
_viv_uniform float outputScale;
_viv_uniform float outputZP;
_viv_uniform VXC_512Bits uniNormilizationLo_2x8;
_viv_uniform VXC_512Bits uniNormilizationHi_2x8;
#define IMAGE_PRE_PROCESS_COPY_16BITS_NHWC(dst_name, dst_type, copy_type) \
__kernel void pre_process_rgb_copy_nhwc_U8to##dst_name \
( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
global int *xRatio, \
global int *yRatio, \
global int *xOffset, \
global int *yOffset, \
float rMean, \
float gMean, \
float bMean, \
float f32Var, \
int reverse_channel, \
int trans \
) \
{ \
int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
\
coord.xy += (int2) (*xOffset, *yOffset); \
vxc_uchar16 src0, src1; \
dst_type dst0, dst1; \
copy_type dst; \
\
VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
\
f32Var *= outputScale; \
float4 paramData = (float4)(rMean * f32Var - outputZP, gMean * f32Var - outputZP, \
bMean * f32Var - outputZP, f32Var); \
half4 paramData_f16; \
_viv_asm(CONV, paramData_f16, paramData); \
\
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(0)); \
coord_out.z = coord_out.x + 8; \
\
VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniNormilizationLo_2x8); \
VXC_DP2x8(dst1, src0, paramData_f16, VXC_MODIFIER(0, 6, 0, VXC_RM_ToNearestEven, 1), \
uniNormilizationHi_2x8); \
_viv_asm(COPY, dst, dst0, 16); \
VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, dst, dst1, 16); \
VXC_WriteImage(output, coord_out.zy, dst, VXC_MODIFIER(0, 6, 0, VXC_RM_TowardZero, 0)); \
}
IMAGE_PRE_PROCESS_COPY_16BITS_NHWC(I16, vxc_short8, vxc_short8)
IMAGE_PRE_PROCESS_COPY_16BITS_NHWC(F16, vxc_half8, vxc_short8)
#define IMAGE_PRE_PROCESS_COPY_8BITS_NHWC(dst_name, dst_type) \
__kernel void pre_process_rgb_copy_nhwc_U8to##dst_name \
( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
global int *xRatio, \
global int *yRatio, \
global int *xOffset, \
global int *yOffset, \
float rMean, \
float gMean, \
float bMean, \
float f32Var, \
int reverse_channel, \
int trans \
) \
{ \
int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
coord.xy += (int2) (*xOffset, *yOffset); \
vxc_uchar16 src0, src1; \
dst_type dst; \
\
VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
\
f32Var *= outputScale; \
float4 paramData = (float4)(rMean * f32Var - outputZP, gMean * f32Var - outputZP, \
bMean * f32Var - outputZP, f32Var); \
\
half4 paramData_f16; \
_viv_asm(CONV, paramData_f16, paramData); \
\
int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); \
\
VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniNormilizationLo_2x8); \
VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(8, 14, 0, VXC_RM_ToNearestEven, 1), \
uniNormilizationHi_2x8); \
VXC_WriteImage(output, coord_out, dst, VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0)); \
}
IMAGE_PRE_PROCESS_COPY_8BITS_NHWC(U8, vxc_uchar16)
IMAGE_PRE_PROCESS_COPY_8BITS_NHWC(I8, vxc_char16)
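In the copy kernels above, the per-channel mean subtraction, the var scaling and the output quantization fold into one multiply-subtract: with f32Var = var * outputScale, dst = src * f32Var - (mean * f32Var - outputZP), and it is exactly that per-channel constant that paramData carries into the DP2x8 instructions. A short C check of the identity, with all numeric values illustrative.
#include <stdint.h>
#include <stdio.h>
int main(void)
{
    float mean[3] = { 123.7f, 116.8f, 103.9f };   /* rMean, gMean, bMean (illustrative) */
    float var = 1.0f / 58.4f;
    float out_scale = 2.0f, out_zp = 128.0f;      /* outputScale, outputZP (illustrative) */
    float f32_var = var * out_scale;              /* f32Var *= outputScale */
    uint8_t src[3] = { 200, 50, 90 };
    for (int c = 0; c < 3; ++c) {
        float folded   = src[c] * f32_var - (mean[c] * f32_var - out_zp);
        float unfolded = (src[c] - mean[c]) * var * out_scale + out_zp;
        printf("channel %d: folded %f, unfolded %f\n", c, folded, unfolded);
    }
    return 0;
}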

View File

@ -1,172 +0,0 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniVecShift10;
_viv_uniform VXC_512Bits uniAddRShift;
_viv_uniform VXC_512Bits uniGetTempVal;
_viv_uniform VXC_512Bits uniExtractBytes;
_viv_uniform VXC_512Bits uniUnpackToR;
_viv_uniform VXC_512Bits uniUnpackToG;
_viv_uniform VXC_512Bits uniUnpackToB;
_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;
_viv_uniform float outputScale;
_viv_uniform VXC_512Bits uniExtract8Data_2x8;
_viv_uniform float outputZP;
_viv_uniform VXC_512Bits uniRePackRGBLo_2x8;
_viv_uniform VXC_512Bits uniRePackRGBHi_2x8;
#define IMAGE_PRE_PROCESS_NHWC(dst_name, conv_type, dst_type, copy_type) \
__kernel void pre_process_rgb_scale_nhwc_U8to##dst_name \
( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
global int *xRatio, \
global int *yRatio, \
global int *xOffset, \
global int *yOffset, \
float rMean, \
float gMean, \
float bMean, \
float f32Var, \
int reverse_channel, \
int trans \
) \
{ \
int2 ratioXY = (int2)(*xRatio, *yRatio); \
int4 xPos = get_global_id(0); \
int yPos = get_global_id(1); \
int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \
xPos += (int4)(0, 1, 2, 3); \
\
/*x*/ \
int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \
int4 sx = fx0 & 0xffff8000; \
fx0 -= sx; \
sx = sx >> 15; \
\
vxc_short4 fx; \
VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); \
/*y*/ \
int fy = yPos * ratioXY.y + ratioSufXY.y; \
int sy = fy & 0xffff8000; \
\
fy -= sy; \
sy = sy >> 15; \
\
fy = (fy + (1<< 4)) >> 5; \
\
vxc_uchar16 line0RGB1, line0RGB2; \
vxc_uchar16 line1RGB3, line1RGB4; \
int4 coord; \
sx = sx * 3 + *xOffset; \
coord.xyz = sx.xyz; \
coord.w = sy + *yOffset; \
int2 coord1 = (int2)(sx.w, coord.w); \
VXC_ReadImage(line0RGB1, input, coord.xw, 0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(line0RGB1, input, coord.yw, 0, VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(line0RGB2, input, coord.zw, 0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(line0RGB2, input, coord1, 0, VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \
\
VXC_ReadImage(line1RGB3, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), \
VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(line1RGB3, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), \
VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(line1RGB4, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), \
VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(line1RGB4, input, coord1, VXC_5BITOFFSET_XY(0, 1), \
VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \
\
float4 bgrMean = (float4)(bMean, gMean, rMean, 0); \
\
bgrMean *= f32Var; \
\
int4 test01, temp1; \
int4 test02, temp2; \
int4 tt; \
vxc_uchar4 val; \
int4 coord_out = (int4)(xPos.x * 3, yPos, xPos.x * 3 + 6, 0); \
\
vxc_uchar8 line1, line2; \
\
/*R*/ \
VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); \
VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); \
\
VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \
VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \
temp1 = temp1 + test01; \
\
VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \
VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \
temp2 = temp2 + test02; \
temp2 = fy * (temp2 - temp1) + (temp1 << 10); \
\
vxc_float4 tmp_dst; \
vxc_uchar4 u8_dst; \
VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \
VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
uniConvertIntergetoF32_4x4); \
\
/*convert U8 to dst*/ \
dst_type dstRG, dstB, dst; \
tmp_dst = tmp_dst * f32Var - bgrMean.zzzz; \
tmp_dst = tmp_dst * outputScale + outputZP; \
conv_type dst0; \
_viv_asm(CONV_RTE, dst0, tmp_dst); \
VXC_DP2x8(dstRG, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \
\
/*G*/ \
VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); \
VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); \
\
VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \
VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \
temp1 = temp1 + test01; \
\
VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \
VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \
temp2 = temp2 + test02; \
temp2 = fy * (temp2 - temp1) + (temp1 << 10); \
\
VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \
VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
uniConvertIntergetoF32_4x4); \
\
tmp_dst = tmp_dst * f32Var - bgrMean.y; \
tmp_dst = tmp_dst * outputScale + outputZP; \
_viv_asm(CONV_RTE, dst0, tmp_dst); \
VXC_DP2x8(dstRG, dst0, dst0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \
\
/*B*/ \
VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); \
VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); \
\
VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \
VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \
temp1 = temp1 + test01; \
\
VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \
VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \
temp2 = temp2 + test02; \
temp2 = fy * (temp2 - temp1) + (temp1 << 10); \
\
VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \
VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
uniConvertIntergetoF32_4x4); \
\
tmp_dst = tmp_dst * f32Var - bgrMean.x; \
tmp_dst = tmp_dst * outputScale + outputZP; \
_viv_asm(CONV_RTE, dst0, tmp_dst); \
VXC_DP2x8(dstB, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \
VXC_DP2x8(dst, dstRG, dstB, VXC_MODIFIER(0, 5, 0, VXC_RM_ToNearestEven, 1), uniRePackRGBLo_2x8); \
copy_type result; \
_viv_asm(COPY, result, dst, 16); \
VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \
VXC_DP2x8(dst, dstRG, dstB, VXC_MODIFIER(0, 5, 0, VXC_RM_ToNearestEven, 1), uniRePackRGBHi_2x8); \
_viv_asm(COPY, result, dst, 16); \
VXC_WriteImage(output, coord_out.zy, result, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \
}
IMAGE_PRE_PROCESS_NHWC(U8, uint4, vxc_uchar16, vxc_uchar16)
IMAGE_PRE_PROCESS_NHWC(I8, int4, vxc_char16, vxc_char16)
IMAGE_PRE_PROCESS_NHWC(I16, int4, vxc_short8, vxc_short8)
IMAGE_PRE_PROCESS_NHWC(F16, half4, vxc_half8, vxc_short8)

View File

@ -23,19 +23,6 @@ _viv_uniform VXC_512Bits uniCalculateTmpB3rd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpB4th_4x4;
_viv_uniform VXC_512Bits uniCalculateB1st_4x4;
_viv_uniform VXC_512Bits uniPackBG0_2x8;
_viv_uniform VXC_512Bits uniPackTmpAndR_2x8;
_viv_uniform VXC_512Bits uniPackRB0_2x8;
_viv_uniform VXC_512Bits uniPackTmp0AndG_2x8;
_viv_uniform VXC_512Bits uniPackGR1_2x8;
_viv_uniform VXC_512Bits uniPackTmp1AndB_2x8;
_viv_uniform VXC_512Bits uniPackBG1_2x8;
_viv_uniform VXC_512Bits uniPackTmp1AndR_2x8;
_viv_uniform VXC_512Bits uniPackRB2_2x8;
_viv_uniform VXC_512Bits uniPackTmp2AndG_2x8;
_viv_uniform VXC_512Bits uniPackGR2_2x8;
_viv_uniform VXC_512Bits uniPackTmp2AndB_2x8;
_viv_uniform VXC_512Bits uniQuantU8toU8LoB_2x8;
_viv_uniform VXC_512Bits uniQuantU8toU8HiB_2x8;
_viv_uniform VXC_512Bits uniQuantU8toU8LoG_2x8;
@ -145,137 +132,3 @@ __kernel void pre_process_yuv420_copy_U8toU8(
VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
}
// store bgrbgrbgr
__kernel void pre_process_yuv420_copy_trans_U8(
__read_only image2d_t y_img,
__read_only image2d_t u_img,
__read_only image2d_t v_img,
__write_only image2d_array_t output,
global int * xRatio,
global int * yRatio,
global int * xOffset,
global int * yOffset,
float rMean,
float gMean,
float bMean,
float var,
int reverse_channel,
int trans
)
{
int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0);
int4 pos1 = (int4)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1, 0, 0);
vxc_uchar16 Y;
vxc_uchar8 U, V;
vxc_int4 C0, C1, C2, C3;
vxc_uchar16 R, G, B;
vxc_uchar16 dst;
VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(U, u_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
var *= outputScale;
float4 paramData = (float4)(bMean * var - zp, gMean * var - zp,\
rMean * var - zp, var);
half4 paramData_f16;
_viv_asm(CONV, paramData_f16, paramData);
//C = Y - 16;
//D = U - 128;
//E = V - 128;
// calculate R
// ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]
int tmpV = -56992;
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4);
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4);
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4);
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4);
VXC_DP4x4(dst, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
VXC_DP4x4(dst, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
VXC_DP4x4(dst, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
VXC_DP4x4(dst, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
VXC_DP2x8(R, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8);
VXC_DP2x8(R, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8);
// calculate G
// ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]
// 298Y - 208V
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4);
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4);
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4);
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4);
// 34784 - 100U
ushort tmpG = 34784;
vxc_ushort8 tmpDstG;
VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);
VXC_DP4x4(dst, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4);
VXC_DP4x4(dst, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4);
VXC_DP4x4(dst, C2, tmpDstG, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4);
VXC_DP4x4(dst, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4);
VXC_DP2x8(G, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8);
VXC_DP2x8(G, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8);
// calculate B
// ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]
VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4);
VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4);
VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4);
VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4);
tmpV = -70688;
VXC_DP4x4(dst, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
VXC_DP4x4(dst, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
VXC_DP4x4(dst, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
VXC_DP4x4(dst, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
VXC_DP2x8(B, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8);
VXC_DP2x8(B, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8);
// reorder to bgr
vxc_uchar8 tmpdst0, tmpdst1;
vxc_uchar16 dst0, dst1, dst2;
if(bOrder == 2)
{
vxc_uchar16 exchangeData = B;
B = R;
R = exchangeData;
}
// BGR BGR BG
VXC_DP2x8(tmpdst0, B, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackBG0_2x8);
VXC_DP2x8(dst0, tmpdst0, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmpAndR_2x8);
// RBG RBG RB
VXC_DP2x8(tmpdst0, R, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackRB0_2x8);
VXC_DP2x8(dst0, tmpdst0, G, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp0AndG_2x8);
pos = (int4)(get_global_id(0) * 3, get_global_id(1), 0, 0);
VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
pos.x += 16;
// GRB GRB GR
VXC_DP2x8(tmpdst0, G, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackGR1_2x8);
VXC_DP2x8(dst1, tmpdst0, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmp1AndB_2x8);
// BGR BGR BG
VXC_DP2x8(tmpdst0, B, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackBG1_2x8);
VXC_DP2x8(dst1, tmpdst0, R, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp1AndR_2x8);
VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
pos.x += 16;
// RBG RBG RB
VXC_DP2x8(tmpdst0, R, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackRB2_2x8);
VXC_DP2x8(dst2, tmpdst0, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmp2AndG_2x8);
// GRB GRB GR
VXC_DP2x8(tmpdst1, G, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackGR2_2x8);
VXC_DP2x8(dst2, tmpdst1, B, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp2AndB_2x8);
VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
}
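The constants -56992, 34784 and -70688 used above are the standard integer BT.601 YUV-to-RGB formulas with C = Y - 16, D = U - 128, E = V - 128 folded in, as the comments spell out. A plain C version of that conversion follows; the clamp helper and the sample YUV triple are illustrative.
#include <stdint.h>
#include <stdio.h>

static uint8_t clamp_u8(int v) { return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); }

static void yuv_to_rgb(int y, int u, int v, uint8_t *r, uint8_t *g, uint8_t *b)
{
    *r = clamp_u8((298 * y + 409 * v - 56992) >> 8);           /* (298C + 409E + 128) >> 8 */
    *g = clamp_u8((298 * y - 100 * u - 208 * v + 34784) >> 8); /* (298C - 100D - 208E + 128) >> 8 */
    *b = clamp_u8((298 * y + 516 * u - 70688) >> 8);           /* (298C + 516D + 128) >> 8 */
}

int main(void)
{
    uint8_t r, g, b;
    yuv_to_rgb(81, 90, 240, &r, &g, &b);   /* roughly pure red */
    printf("R=%u G=%u B=%u\n", r, g, b);
    return 0;
}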

View File

@ -1,235 +0,0 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniCalculateR1st_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;
_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;
_viv_uniform VXC_512Bits uniCalculateB1st_4x4;
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
_viv_uniform VXC_512Bits uniDescaleU8_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;
_viv_uniform VXC_512Bits uniCalculateGWise_4x4;
_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4;
_viv_uniform VXC_512Bits uniTransPackBgr1st_2x8;
_viv_uniform VXC_512Bits uniTransPackBgr2nd_2x8;
_viv_uniform int bOrder;
_viv_uniform int rOrder;
_viv_uniform int zp;
_viv_uniform float outputScale;
__kernel void pre_process_yuv420_trans_U8toU8(
__read_only image2d_array_t y_img, __read_only image2d_array_t u_img,
__read_only image2d_array_t v_img, __write_only image2d_array_t output,
global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,
float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)
{
int4 gidx = get_global_id(0);
int gidy = get_global_id(1);
gidx += (int4)(0, 1, 2, 3);
int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14);
int4 sx = fx & 0xffff8000; // Floor
int fy, sy;
fx -= sx;
sx = sx >> 15;
fx = (fx +(1 << 4)) >> 5;
// for y
fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14);
sy = fy & 0xffff8000; // Floor
fy -= sy;
sy = sy >> 15;
sy = sy < 0 ? 0 : sy;
fy = fy < 0 ? 0 : fy;
fy = (fy + (1<< 4)) >> 5;
sx += (*xOffset);
sy += (*yOffset);
int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0);
int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0);
int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0);
vxc_uchar16 Y, U, V;
vxc_int4 C0, C1, C2, C3;
vxc_uchar16 R, G, B;
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
srcPos1.x = (sx.x + 1) >> 1;
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
srcPos2.x = (sx.x + 1) >> 1;
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
srcPos.x = sx.y;
srcPos1.x = sx.y >> 1;
srcPos2.x = sx.y >> 1;
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
srcPos1.x = (sx.y + 1) >> 1;
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
srcPos2.x = (sx.y + 1) >> 1;
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
srcPos.x = sx.z;
srcPos1.x = sx.z >> 1;
srcPos2.x = sx.z >> 1;
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));
srcPos1.x = (sx.z + 1) >> 1;
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));
srcPos2.x = (sx.z + 1) >> 1;
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));
srcPos.x = sx.w;
srcPos1.x = sx.w >> 1;
srcPos2.x = sx.w >> 1;
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));
srcPos1.x = (sx.w + 1) >> 1;
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));
srcPos2.x = (sx.w + 1) >> 1;
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));
//C = Y - 16; D = U - 128; E = V - 128;
// ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]
int tmpV = -56992;
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4);
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4);
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4);
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4);
VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
// ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]
// 298Y - 208V
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4);
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4);
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4);
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4);
// 34784 - 100U
ushort tmpG = 34784;
vxc_ushort8 tmpDstG, tmpDstG1;
VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);
VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8);
VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);
VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);
VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);
VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);
// ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]
VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4);
VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4);
VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4);
VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4);
tmpV = -70688;
VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
int4 result, temp1, temp2, dstR, dstG, dstB;
int4 tmpData0, tmpData1;
VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);
VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);
temp1 = fx * tmpData0 + tmpData1;
// temp2 - temp1
VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);
VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);
temp2 = fx * tmpData0 + tmpData1;
result = fy * temp2 + (temp1 << 10);
tmpV = 1 << 19;
vxc_uchar8 dst, tmpPack;
float4 tmpDst;
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);
tmpDst = (tmpDst - bMean) * var;
dstB = convert_int4_rte(tmpDst * outputScale + zp);
VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);
VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);
temp1 = fx * tmpData0 + tmpData1;
VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);
VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);
temp2 = fx * tmpData0 + tmpData1;
result = fy * temp2 + (temp1 << 10);
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);
tmpDst = (tmpDst - gMean) * var;
dstG = convert_int4_rte(tmpDst * outputScale + zp);
VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);
VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);
temp1 = fx * tmpData0 + tmpData1;
VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);
VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);
temp2 = fx * tmpData0 + tmpData1;
result = fy * temp2 + (temp1 << 10);
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);
tmpDst = (tmpDst - rMean) * var;
dstR = convert_int4_rte(tmpDst * outputScale + zp);
if(bOrder == 2)
{
int4 exchangeData = dstB;
dstB = dstR;
dstR = exchangeData;
}
VXC_DP2x8(tmpPack, dstB, dstG, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);
VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr1st_2x8);
VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr2nd_2x8);
int2 dstPos = (int2)(get_global_id(0) * 3, gidy);
VXC_WriteImage(output, dstPos, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
}

View File

@ -22,19 +22,6 @@ _viv_uniform VXC_512Bits uniCalculateTmpB3rd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpB4th_4x4;
_viv_uniform VXC_512Bits uniCalculateB1st_4x4;
_viv_uniform VXC_512Bits uniPackBG0_2x8;
_viv_uniform VXC_512Bits uniPackTmpAndR_2x8;
_viv_uniform VXC_512Bits uniPackRB0_2x8;
_viv_uniform VXC_512Bits uniPackTmp0AndG_2x8;
_viv_uniform VXC_512Bits uniPackGR1_2x8;
_viv_uniform VXC_512Bits uniPackTmp1AndB_2x8;
_viv_uniform VXC_512Bits uniPackBG1_2x8;
_viv_uniform VXC_512Bits uniPackTmp1AndR_2x8;
_viv_uniform VXC_512Bits uniPackRB2_2x8;
_viv_uniform VXC_512Bits uniPackTmp2AndG_2x8;
_viv_uniform VXC_512Bits uniPackGR2_2x8;
_viv_uniform VXC_512Bits uniPackTmp2AndB_2x8;
_viv_uniform VXC_512Bits uniQuantU8toU8LoB_2x8;
_viv_uniform VXC_512Bits uniQuantU8toU8HiB_2x8;
_viv_uniform VXC_512Bits uniQuantU8toU8LoG_2x8;
@ -143,137 +130,3 @@ __kernel void pre_process_yuv444_copy_U8toU8(
pos.z = rOrder;
VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
}
// store bgrbgrbgr
__kernel void pre_process_yuv444_copy_trans_U8(
__read_only image2d_t y_img,
__read_only image2d_t u_img,
__read_only image2d_t v_img,
__write_only image2d_array_t output,
global int * xRatio,
global int * yRatio,
global int * xOffset,
global int * yOffset,
float rMean,
float gMean,
float bMean,
float var,
int reverse_channel,
int trans
)
{
int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0);
vxc_uchar16 Y, U, V;
vxc_int4 C0, C1, C2, C3;
vxc_uchar16 R, G, B;
vxc_uchar16 dst;
VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(U, u_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
var *= outputScale;
float4 paramData = (float4)(bMean * var - zp, gMean * var - zp,\
rMean * var - zp, var);
half4 paramData_f16;
_viv_asm(CONV, paramData_f16, paramData);
//C = Y - 16;
//D = U - 128;
//E = V - 128;
// calculate R
// ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]
int tmpV = -56992;
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4);
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4);
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4);
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4);
VXC_DP4x4(dst, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
VXC_DP4x4(dst, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
VXC_DP4x4(dst, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
VXC_DP4x4(dst, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
VXC_DP2x8(R, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8);
VXC_DP2x8(R, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8);
// calculate G
// ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]
// 298Y - 208V
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4);
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4);
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4);
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4);
// 34784 - 100U
ushort tmpG = 34784;
vxc_ushort8 tmpDstG0, tmpDstG1;
VXC_DP2x8(tmpDstG0, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);
VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2_2x8);
VXC_DP4x4(dst, C0, tmpDstG0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4);
VXC_DP4x4(dst, C1, tmpDstG0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4);
VXC_DP4x4(dst, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4);
VXC_DP4x4(dst, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4);
VXC_DP2x8(G, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8);
VXC_DP2x8(G, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8);
// calculate B
// ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]
VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4);
VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4);
VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4);
VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4);
tmpV = -70688;
VXC_DP4x4(dst, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
VXC_DP4x4(dst, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
VXC_DP4x4(dst, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
VXC_DP4x4(dst, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
VXC_DP2x8(B, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8);
VXC_DP2x8(B, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8);
// reorder to bgr
vxc_uchar8 tmpdst0, tmpdst1;
vxc_uchar16 dst0, dst1, dst2;
if(bOrder == 2)
{
vxc_uchar16 exchangeData = B;
B = R;
R = exchangeData;
}
// BGR BGR BG
VXC_DP2x8(tmpdst0, B, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackBG0_2x8);
VXC_DP2x8(dst0, tmpdst0, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmpAndR_2x8);
// RBG RBG RB
VXC_DP2x8(tmpdst0, R, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackRB0_2x8);
VXC_DP2x8(dst0, tmpdst0, G, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp0AndG_2x8);
pos = (int4)(get_global_id(0) * 3, get_global_id(1), 0, 0);
VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
pos.x += 16;
// GRB GRB GR
VXC_DP2x8(tmpdst0, G, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackGR1_2x8);
VXC_DP2x8(dst1, tmpdst0, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmp1AndB_2x8);
// BGR BGR BG
VXC_DP2x8(tmpdst0, B, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackBG1_2x8);
VXC_DP2x8(dst1, tmpdst0, R, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp1AndR_2x8);
VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
pos.x += 16;
// RBG RBG RB
VXC_DP2x8(tmpdst0, R, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackRB2_2x8);
VXC_DP2x8(dst2, tmpdst0, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmp2AndG_2x8);
// GRB GRB GR
VXC_DP2x8(tmpdst1, G, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackGR2_2x8);
VXC_DP2x8(dst2, tmpdst1, B, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp2AndB_2x8);
VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
}

View File

@ -1,196 +0,0 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniCalculateR1st_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;
_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;
_viv_uniform VXC_512Bits uniCalculateB1st_4x4;
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
_viv_uniform VXC_512Bits uniDescaleU8_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;
_viv_uniform VXC_512Bits uniCalculateGWise_4x4;
_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4;
_viv_uniform VXC_512Bits uniTransPackBgr1st_2x8;
_viv_uniform VXC_512Bits uniTransPackBgr2nd_2x8;
_viv_uniform int bOrder;
_viv_uniform int rOrder;
_viv_uniform int zp;
_viv_uniform float outputScale;
#define IMAGE_PRE_PROCESS_YUV444_TRANS(dst_name, dst_type) \
__kernel void pre_process_yuv444_trans_U8to##dst_name( \
__read_only image2d_t y_img, __read_only image2d_t u_img, \
__read_only image2d_t v_img, __write_only image2d_t output, \
global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, \
float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) \
{ \
int4 gidx = get_global_id(0); \
int gidy = get_global_id(1); \
gidx += (int4)(0, 1, 2, 3); \
\
int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); \
int4 sx = fx & 0xffff8000; \
int fy, sy; \
fx -= sx; \
sx = sx >> 15; \
fx = (fx +(1 << 4)) >> 5; \
\
fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); \
sy = fy & 0xffff8000; \
fy -= sy; \
sy = sy >> 15; \
\
sy = sy < 0 ? 0 : sy; \
fy = fy < 0 ? 0 : fy; \
\
fy = (fy + (1<< 4)) >> 5; \
sx += (*xOffset); \
sy += (*yOffset); \
int2 srcPos = (int2)(sx.x, sy); \
\
vxc_uchar16 Y, U, V; \
vxc_int4 C0, C1, C2, C3; \
vxc_uchar16 R, G, B; \
\
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
\
srcPos.x = sx.y; \
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
\
srcPos.x = sx.z; \
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \
\
srcPos.x = sx.w; \
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \
\
int tmpV = -56992; \
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); \
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); \
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); \
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); \
VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \
VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \
VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \
VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \
\
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); \
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); \
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); \
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); \
\
ushort tmpG = 34784; \
vxc_ushort8 tmpDstG, tmpDstG1; \
VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); \
VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); \
VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \
VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \
VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \
VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \
\
VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); \
VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); \
VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); \
VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); \
tmpV = -70688; \
VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \
VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \
VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \
VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \
\
int4 result, temp1, temp2, dstR, dstG, dstB; \
int4 tmpData0, tmpData1; \
\
VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \
VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \
temp1 = fx * tmpData0 + tmpData1; \
\
VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \
VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \
temp2 = fx * tmpData0 + tmpData1; \
result = fy * temp2 + (temp1 << 10); \
\
tmpV = 1 << 19; \
dst_type dst, tmpPack; \
float4 tmpDst; \
\
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \
tmpDst = (tmpDst - bMean) * var; \
dstB = convert_int4_rte(tmpDst * outputScale + zp); \
\
VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \
VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \
temp1 = fx * tmpData0 + tmpData1; \
VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \
VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \
temp2 = fx * tmpData0 + tmpData1; \
result = fy * temp2 + (temp1 << 10); \
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \
tmpDst = (tmpDst - gMean) * var; \
dstG = convert_int4_rte(tmpDst * outputScale + zp); \
\
VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \
VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \
temp1 = fx * tmpData0 + tmpData1; \
VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \
VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \
temp2 = fx * tmpData0 + tmpData1; \
result = fy * temp2 + (temp1 << 10); \
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \
tmpDst = (tmpDst - rMean) * var; \
dstR = convert_int4_rte(tmpDst * outputScale + zp); \
\
if(bOrder == 2) \
{ \
int4 exchangeData = dstB; \
dstB = dstR; \
dstR = exchangeData; \
} \
\
VXC_DP2x8(tmpPack, dstB, dstG, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); \
VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr1st_2x8); \
VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr2nd_2x8); \
\
int2 dstPos = (int2)(get_global_id(0) * 3, gidy); \
VXC_WriteImage(output, dstPos, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); \
}
IMAGE_PRE_PROCESS_YUV444_TRANS(U8, vxc_uchar16)
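
The macro body above ends the YUV444 pre-process path: the gathered Y/U/V neighbours are converted to R/G/B with fixed-point dot products, the four neighbours are blended with the fx/fy weights, and the result is normalized with the per-channel mean/var and requantized with outputScale/zp (R and B swap when bOrder == 2). A minimal scalar sketch of that arithmetic in plain float C, assuming BT.601-style coefficients — the kernel's real coefficients are baked into the uniCalculate* uniforms:

    #include <stdio.h>

    /* Float sketch of one output value of the YUV444 pre-process path.
     * 1.402 is an assumed BT.601-style coefficient; G and B use analogous
     * conversions with their own coefficients in the real kernel. */
    static float yuv_to_r(float y, float v) { return y + 1.402f * (v - 128.f); }

    static float lerp2d(float p00, float p10, float p01, float p11, float fx, float fy)
    {
        float top    = p00 + (p10 - p00) * fx;
        float bottom = p01 + (p11 - p01) * fx;
        return top + (bottom - top) * fy;        /* same blend the macro does per channel */
    }

    int main(void)
    {
        /* four neighbouring source samples (Y, V) and the bilinear weights */
        float Y[4] = {90, 95, 100, 105}, V[4] = {150, 150, 152, 152};
        float fx = 0.25f, fy = 0.5f;
        float rMean = 127.5f, var = 1.f / 127.5f; /* per-channel normalization */
        float outputScale = 64.f, zp = 128.f;     /* requantization parameters */

        float r[4];
        for (int i = 0; i < 4; i++) r[i] = yuv_to_r(Y[i], V[i]);
        float rBlend = lerp2d(r[0], r[1], r[2], r[3], fx, fy);
        float q = (rBlend - rMean) * var * outputScale + zp;
        q = q < 0.f ? 0.f : (q > 255.f ? 255.f : q);
        printf("R out: %d\n", (int)(q + 0.5f));   /* G and B follow the same pattern */
        return 0;
    }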

View File

@ -28,37 +28,34 @@ __kernel void resize_bilinear_BF16toBF16_DOWN
float top_y_f = floor(in_y);
float y_lerp = in_y - top_y_f;
int top_y_idx = convert_int(top_y_f);
int bottom_y_idx = top_y_idx + 1;
vxc_short8 top;
vxc_short8 bottom;
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(top, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_ReadImage2DArray(top, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_ReadImage2DArray(top, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_ReadImage2DArray(top, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
coord_in.y = bottom_y_idx;
coord_in.x = left_x_idx.x;
VXC_ReadImage2DArray(bottom, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_ReadImage2DArray(bottom, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_ReadImage2DArray(bottom, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_ReadImage2DArray(bottom, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
vxc_ushort8 src;
float4 left4;
@ -84,7 +81,14 @@ __kernel void resize_bilinear_BF16toBF16_DOWN
vxc_ushort8 tmp, dst;
_viv_asm(COPY, tmp, dst4, 16);
dst.s0123 = tmp.s1357;
VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
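
In the rewritten DOWN kernels the separately computed right_x_idx / bottom_y_idx (from ceil) disappear: the right and bottom taps are fetched as the sample immediately after the left/top one, via the adjacent lane or the (0, 1) load offset. That is equivalent because the source coordinate is mapped as in = (dst + half_pixel_value) * scale - half_pixel_value, the left tap is floor(in) and the blend weight is the fractional part; when in lands exactly on a sample the weight of the extra tap is zero. A small sketch of the mapping, assuming half_pixel_value is 0.5 for half-pixel centers (and 0 otherwise):

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        float scale = 2.0f;             /* e.g. downscale by 2 */
        float half_pixel_value = 0.5f;  /* assumption, see above */
        int dst_x = 3;

        float in_x   = ((float)dst_x + half_pixel_value) * scale - half_pixel_value;
        float left_f = floorf(in_x);
        float x_lerp = in_x - left_f;   /* weight of the right neighbour */
        int   left   = (int)left_f;

        printf("src taps %d and %d, weight %.2f\n", left, left + 1, x_lerp);
        return 0;
    }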
__kernel void resize_bilinear_BF16toBF16_UP
@ -107,22 +111,24 @@ __kernel void resize_bilinear_BF16toBF16_UP
float top_y_f = floor(in_y);
float y_lerp = in_y - top_y_f;
int top_y_idx = convert_int(top_y_f);
float bottom_y_f = ceil(in_y);
int bottom_y_idx= convert_int(bottom_y_f);
vxc_ushort8 src0, src1, src2, src3, dst0, dst1;
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(src0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input, coord_in, \
VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.y = bottom_y_idx;
VXC_ReadImage2DArray(src2, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src3, input, coord_in, \
VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_ushort8 bitextract_p0;
vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
@ -132,29 +138,36 @@ __kernel void resize_bilinear_BF16toBF16_UP
VXC_DP2x8(maskShift, bitextract_p0, constData, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);
do
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
float4 left4;
float4 right4;
float4 top4;
float4 bottom4;
int loop = depth - 1;
while (coord_in.z < loop)
{
VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.z ++;
coord_in.y = top_y_idx;
VXC_ReadImage2DArray(src0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input, coord_in, \
VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.y = bottom_y_idx;
VXC_ReadImage2DArray(src2, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src3, input, coord_in, \
VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.w += input_desc.s4;
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.z ++;
vxc_ushort8 dst_tmp;
float4 left4;
float4 right4;
float4 top4;
float4 bottom4;
VXC_DP2x8(dst_tmp, dst0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, left4, dst_tmp, 16);
@ -176,7 +189,30 @@ __kernel void resize_bilinear_BF16toBF16_UP
vxc_ushort8 tmp, dst;
_viv_asm(COPY, tmp, dst4, 16);
dst.s0123 = tmp.s1357;
VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_out.z ++;
} while (coord_in.z < depth);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_out.w += output_desc.s4;
}
VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_ushort8 dst_tmp;
VXC_DP2x8(dst_tmp, dst0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, left4, dst_tmp, 16);
VXC_DP2x8(dst_tmp, dst0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, right4, dst_tmp, 16);
right4 -= left4;
top4 = right4 * x_lerp + left4;
VXC_DP2x8(dst_tmp, dst1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, left4, dst_tmp, 16);
VXC_DP2x8(dst_tmp, dst1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, right4, dst_tmp, 16);
right4 -= left4;
bottom4 = right4 * x_lerp + left4;
bottom4 -= top4;
float4 dst4 = bottom4 * y_lerp + top4;
vxc_ushort8 tmp, dst;
_viv_asm(COPY, tmp, dst4, 16);
dst.s0123 = tmp.s1357;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
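
The BF16 kernels above lean on bfloat16 being the upper 16 bits of an IEEE-754 float32: the uniConvBF16toF32_Part0/1_2x8 uniforms widen each stored 16-bit value into the high half of a 32-bit lane, and on the way out `dst.s0123 = tmp.s1357` keeps only the high half of each computed float (truncation, assuming the little-endian lane layout). A host-side sketch of the same two conversions:

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    /* bf16 -> f32: the stored bits become the high half of a float32. */
    static float bf16_to_f32(uint16_t h)
    {
        uint32_t bits = (uint32_t)h << 16;
        float f;
        memcpy(&f, &bits, sizeof f);
        return f;
    }

    /* f32 -> bf16 by truncation, like selecting tmp.s1357 in the kernel. */
    static uint16_t f32_to_bf16(float f)
    {
        uint32_t bits;
        memcpy(&bits, &f, sizeof bits);
        return (uint16_t)(bits >> 16);
    }

    int main(void)
    {
        float x = 1.625f;
        uint16_t b = f32_to_bf16(x);
        printf("0x%04x -> %g\n", b, bf16_to_f32(b));  /* 1.625 survives exactly */
        return 0;
    }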

View File

@ -1,7 +1,7 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniExtact8Bit_2x8;
_viv_uniform VXC_512Bits uniFp16toFp32_4x4;
_viv_uniform VXC_512Bits uniFp16toFp32_left_4x4;
_viv_uniform VXC_512Bits uniRightSubLeft_4x4;
_viv_uniform VXC_512Bits uniExtactHalf8_2x8;
_viv_uniform float2 scale_xy;
@ -27,94 +27,66 @@ __kernel void resize_bilinear_F16toF16_DOWN
float4 left_x_f = floor(in_x);
float4 x_lerp = in_x - left_x_f;
int4 left_x_idx = convert_int4(left_x_f);
float4 right_x_f = ceil(in_x);
int4 right_x_idx = convert_int4(right_x_f);
float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;
float top_y_f = floor(in_y);
float y_lerp = in_y - top_y_f;
int top_y_idx = convert_int(top_y_f);
float bottom_y_f = ceil(in_y);
int bottom_y_idx= convert_int(bottom_y_f);
vxc_short8 top_left0, top_right0;
vxc_short8 bottom_left0, bottom_right0;
vxc_half8 top_left, top_right;
vxc_half8 bottom_left, bottom_right;
vxc_short8 top_short, bottom_short, dst;
vxc_half8 top, bottom, result;
int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(top_left0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_ReadImage2DArray(top_left0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_ReadImage2DArray(top_left0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_ReadImage2DArray(top_left0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, top_left, top_left0, 16);
coord_in.x = right_x_idx.x;
VXC_ReadImage2DArray(top_right0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.y;
VXC_ReadImage2DArray(top_right0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.z;
VXC_ReadImage2DArray(top_right0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.w;
VXC_ReadImage2DArray(top_right0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, top_right, top_right0, 16);
coord_in.y = bottom_y_idx;
coord_in.x = left_x_idx.x;
VXC_ReadImage2DArray(bottom_left0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_ReadImage2DArray(bottom_left0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_ReadImage2DArray(bottom_left0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_ReadImage2DArray(bottom_left0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, bottom_left, bottom_left0, 16);
coord_in.x = right_x_idx.x;
VXC_ReadImage2DArray(bottom_right0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.y;
VXC_ReadImage2DArray(bottom_right0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.z;
VXC_ReadImage2DArray(bottom_right0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.w;
VXC_ReadImage2DArray(bottom_right0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, bottom_right, bottom_right0, 16);
VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, top, top_short, 16);
_viv_asm(COPY, bottom, bottom_short, 16);
float4 left4;
float4 right4;
float4 top4;
float4 bottom4;
VXC_DP4x4(left4, top_left, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4);
VXC_DP4x4(right4, top_right, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(left4, bottom_left, bottom_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4);
VXC_DP4x4(right4, bottom_right, bottom_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
bottom4 = right4 * x_lerp + left4;
bottom4 -= top4;
float4 dst4 = bottom4 * y_lerp + top4;
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);
VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
bottom4 = right4 * x_lerp + left4;
bottom4 -= top4;
float4 dst4 = bottom4 * y_lerp + top4;
half4 tmp;
_viv_asm(CONV, tmp, dst4);
VXC_DP2x8(top_left, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
_viv_asm(COPY, top_left0, top_left, 16);
VXC_WriteImage2DArray(output, coord_out, top_left0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
VXC_DP2x8(result, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
_viv_asm(COPY, dst, result, 16);
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
__kernel void resize_bilinear_F16toU8_DOWN
@ -131,84 +103,50 @@ __kernel void resize_bilinear_F16toU8_DOWN
float4 left_x_f = floor(in_x);
float4 x_lerp = in_x - left_x_f;
int4 left_x_idx = convert_int4(left_x_f);
float4 right_x_f = ceil(in_x);
int4 right_x_idx = convert_int4(right_x_f);
float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;
float top_y_f = floor(in_y);
float y_lerp = in_y - top_y_f;
int top_y_idx = convert_int(top_y_f);
float bottom_y_f = ceil(in_y);
int bottom_y_idx= convert_int(bottom_y_f);
vxc_short8 top_left0, top_right0;
vxc_short8 bottom_left0, bottom_right0;
vxc_half8 top_left, top_right;
vxc_half8 bottom_left, bottom_right;
vxc_short8 top_short, bottom_short;
vxc_half8 top, bottom;
int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(top_left0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_ReadImage2DArray(top_left0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_ReadImage2DArray(top_left0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_ReadImage2DArray(top_left0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, top_left, top_left0, 16);
VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, top, top_short, 16);
_viv_asm(COPY, bottom, bottom_short, 16);
coord_in.x = right_x_idx.x;
VXC_ReadImage2DArray(top_right0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.y;
VXC_ReadImage2DArray(top_right0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.z;
VXC_ReadImage2DArray(top_right0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.w;
VXC_ReadImage2DArray(top_right0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, top_right, top_right0, 16);
coord_in.y = bottom_y_idx;
coord_in.x = left_x_idx.x;
VXC_ReadImage2DArray(bottom_left0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_ReadImage2DArray(bottom_left0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_ReadImage2DArray(bottom_left0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_ReadImage2DArray(bottom_left0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, bottom_left, bottom_left0, 16);
coord_in.x = right_x_idx.x;
VXC_ReadImage2DArray(bottom_right0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.y;
VXC_ReadImage2DArray(bottom_right0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.z;
VXC_ReadImage2DArray(bottom_right0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.w;
VXC_ReadImage2DArray(bottom_right0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, bottom_right, bottom_right0, 16);
float4 left4;
float4 right4;
float4 top4;
float4 bottom4;
VXC_DP4x4(left4, top_left, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4);
VXC_DP4x4(right4, top_right, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(left4, bottom_left, bottom_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4);
VXC_DP4x4(right4, bottom_right, bottom_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);
VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
bottom4 = right4 * x_lerp + left4;
bottom4 -= top4;
float4 dst4 = bottom4 * y_lerp + top4;
@ -216,7 +154,14 @@ __kernel void resize_bilinear_F16toU8_DOWN
int4 dst = convert_int4_rte(dst4);
vxc_uchar8 dst_uchar;
VXC_DP2x8(dst_uchar, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);
VXC_WriteImage2DArray(output, coord_out, dst_uchar, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst_uchar,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
__kernel void resize_bilinear_F16toF16_UP
@ -239,24 +184,26 @@ __kernel void resize_bilinear_F16toF16_UP
float top_y_f = floor(in_y);
float y_lerp = in_y - top_y_f;
int top_y_idx = convert_int(top_y_f);
float bottom_y_f = ceil(in_y);
int bottom_y_idx= convert_int(bottom_y_f);
vxc_ushort8 src0, src1, src2, src3, dst0, dst1;
vxc_half8 top;
vxc_half8 bottom;
int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(src0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input, coord_in, \
VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
coord_in.y = bottom_y_idx;
VXC_ReadImage2DArray(src2, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src3, input, coord_in, \
VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_ushort8 bitextract_p0;
vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
@ -266,32 +213,41 @@ __kernel void resize_bilinear_F16toF16_UP
VXC_DP2x8(maskShift, bitextract_p0, constData, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);
do
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
float4 left4;
float4 right4;
float4 top4;
float4 bottom4;
int loop = depth - 1;
while (coord_in.z < loop)
{
VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, top, dst0, 16);
_viv_asm(COPY, bottom, dst1, 16);
coord_in.w += input_desc.s4;
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.z ++;
coord_in.y = top_y_idx;
VXC_ReadImage2DArray(src0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input, coord_in, \
VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.y = bottom_y_idx;
VXC_ReadImage2DArray(src2, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src3, input, coord_in, \
VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
float4 left4;
float4 right4;
float4 top4;
float4 bottom4;
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4);
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4);
VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);
VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);
VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
bottom4 = right4 * x_lerp + left4;
bottom4 -= top4;
float4 dst4 = bottom4 * y_lerp + top4;
@ -299,7 +255,28 @@ __kernel void resize_bilinear_F16toF16_UP
_viv_asm(CONV, tmp, dst4);
VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
_viv_asm(COPY, dst0, top, 16);
VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_out.z ++;
} while (coord_in.z < depth);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_out.w += output_desc.s4;
}
VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, top, dst0, 16);
_viv_asm(COPY, bottom, dst1, 16);
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);
VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
bottom4 = right4 * x_lerp + left4;
bottom4 -= top4;
float4 dst4 = bottom4 * y_lerp + top4;
half4 tmp;
_viv_asm(CONV, tmp, dst4);
VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
_viv_asm(COPY, dst0, top, 16);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
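
Every *_UP kernel in this commit replaces its `do { ... } while (coord_in.z < depth)` with `int loop = depth - 1; while (coord_in.z < loop)` plus one peeled iteration after the loop. The loop body now prefetches the next slice (`coord_in.w += input_desc.s4`) while computing the current one, and peeling the last iteration keeps that prefetch from reading past the final slice. The pattern, sketched with plain arrays standing in for the image loads and stores:

    #include <stdio.h>

    /* Loop-peeling sketch: compute slice z while prefetching slice z + 1,
     * then handle the last slice after the loop so no prefetch runs past the end. */
    static void process_all_slices(const float *in, float *out, int depth)
    {
        int z = 0;
        float cur = in[0];                 /* prologue: first slice already loaded */
        for (; z < depth - 1; ++z)
        {
            float next = in[z + 1];        /* prefetch next slice while cur is live */
            out[z] = cur * 2.0f;           /* stand-in for the interpolation math */
            cur = next;
        }
        out[z] = cur * 2.0f;               /* peeled tail: compute only, no prefetch */
    }

    int main(void)
    {
        float in[4] = {1, 2, 3, 4}, out[4];
        process_all_slices(in, out, 4);
        for (int i = 0; i < 4; ++i) printf("%g\n", out[i]);
        return 0;
    }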

View File

@ -5,8 +5,8 @@ _viv_uniform float2 scale_xy;
_viv_uniform int depth;
_viv_uniform VXC_512Bits uniConvertI32toI16_2x8;
_viv_uniform VXC_512Bits uniGetMaskShift_2x8;
_viv_uniform VXC_512Bits uniConvertDFP2FP32_part1_4x4;
_viv_uniform VXC_512Bits uniConvertDFP2FP32_4x4;
_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4;
_viv_uniform VXC_512Bits uniRightSubLeft_4x4;
_viv_uniform float dfpScale;
_viv_uniform float half_pixel_value;
@ -34,8 +34,6 @@ __kernel void resize_bilinear_I16toI16_UP
float top_y_f = floor(in_y);
float y_lerp = in_y - top_y_f;
int top_y_idx = convert_int(top_y_f);
float bottom_y_f = ceil(in_y);
int bottom_y_idx= convert_int(bottom_y_f);
vxc_ushort8 src0, src1, src2, src3, dst0, dst1;
@ -44,16 +42,19 @@ __kernel void resize_bilinear_I16toI16_UP
int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(src0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input, coord_in, \
VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
coord_in.y = bottom_y_idx;
VXC_ReadImage2DArray(src2, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src3, input, coord_in, \
VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_ushort8 bitextract_p0;
vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
@ -63,39 +64,42 @@ __kernel void resize_bilinear_I16toI16_UP
VXC_DP2x8(maskShift, bitextract_p0, constData, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);
do
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
float4 left4;
float4 right4;
float4 top4;
float4 bottom4;
int loop = depth - 1;
while (coord_in.z < loop)
{
VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, top, dst0, 16);
_viv_asm(COPY, bottom, dst1, 16);
float4 left4;
float4 right4;
float4 top4;
float4 bottom4;
coord_in.w += input_desc.s4;
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.z ++;
coord_in.y = top_y_idx;
VXC_ReadImage2DArray(src0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input, coord_in, \
VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.y = bottom_y_idx;
VXC_ReadImage2DArray(src2, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src3, input, coord_in, \
VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4);
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(left4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);
VXC_DP4x4(right4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4);
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
bottom4 = right4 * x_lerp + left4;
bottom4 -= top4;
float4 dst4 = bottom4 * y_lerp + top4;
@ -103,10 +107,30 @@ __kernel void resize_bilinear_I16toI16_UP
int4 dst = convert_int4_rte(dst4);
VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);
VXC_WriteImage2DArray(output, coord_out, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_out.z ++;
} while (coord_in.z < depth);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_out.w += output_desc.s4;
}
VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, top, dst0, 16);
_viv_asm(COPY, bottom, dst1, 16);
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(left4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);
VXC_DP4x4(right4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
bottom4 = right4 * x_lerp + left4;
bottom4 -= top4;
float4 dst4 = bottom4 * y_lerp + top4;
dst4 = dst4 * dfpScale;
int4 dst = convert_int4_rte(dst4);
VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
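
The I16 (and I8) variants are dynamic-fixed-point kernels: uniDFPtoFp32_left_4x4 / uniRightSubLeft_4x4 turn the stored integers into floats for the blend, and the single dfpScale multiplier requantizes the result before convert_int4_rte. A sketch under the usual DFP convention — stored = real * 2^fl with fl the fraction length, and dfpScale collapsing the input and output fraction lengths into one factor (exactly how dfpScale is composed, and whether part of it is folded into the uniforms, is an assumption):

    #include <math.h>
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        int fl_in = 10, fl_out = 8;
        int16_t a = 1536, b = 2048;          /* Q10: 1.5 and 2.0 */
        float x_lerp = 0.25f;

        /* blend on the raw integers, then rescale once: 2^(fl_out - fl_in) */
        float dfp_scale = ldexpf(1.0f, fl_out - fl_in);
        float blended   = a + (b - a) * x_lerp;              /* still in Q10 */
        int16_t dst     = (int16_t)lrintf(blended * dfp_scale); /* matches convert_int4_rte */

        printf("real %.4f -> stored %d (Q%d)\n",
               blended * ldexpf(1.0f, -fl_in), dst, fl_out);
        return 0;
    }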
__kernel void resize_bilinear_I16toI16_DOWN
@ -125,103 +149,67 @@ __kernel void resize_bilinear_I16toI16_DOWN
float4 left_x_f = floor(in_x);
float4 x_lerp = in_x - left_x_f;
int4 left_x_idx = convert_int4(left_x_f);
float4 right_x_f = ceil(in_x);
int4 right_x_idx = convert_int4(right_x_f);
float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;
float top_y_f = floor(in_y);
float y_lerp = in_y - top_y_f;
int top_y_idx = convert_int(top_y_f);
float bottom_y_f = ceil(in_y);
int bottom_y_idx= convert_int(bottom_y_f);
vxc_short8 top_left, top_right;
vxc_short8 bottom_left, bottom_right;
vxc_short8 top, bottom, result;
int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(top_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_ReadImage2DArray(top_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_ReadImage2DArray(top_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_ReadImage2DArray(top_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.x;
VXC_ReadImage2DArray(top_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.y;
VXC_ReadImage2DArray(top_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.z;
VXC_ReadImage2DArray(top_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.w;
VXC_ReadImage2DArray(top_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_in.y = bottom_y_idx;
coord_in.x = left_x_idx.x;
VXC_ReadImage2DArray(bottom_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_ReadImage2DArray(bottom_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_ReadImage2DArray(bottom_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_ReadImage2DArray(bottom_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.x;
VXC_ReadImage2DArray(bottom_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.y;
VXC_ReadImage2DArray(bottom_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.z;
VXC_ReadImage2DArray(bottom_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.w;
VXC_ReadImage2DArray(bottom_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
float4 left4;
float4 right4;
float4 top4;
float4 bottom4;
VXC_DP4x4(left4, top_left, top_left, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);
VXC_DP4x4(right4, top_right, top_right, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);
right4 -= left4;
VXC_DP4x4(left4, top, top, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);
VXC_DP4x4(right4, top, top, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(left4, bottom_left, bottom_left, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);
VXC_DP4x4(right4, bottom_right, bottom_right, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);
right4 -= left4;
VXC_DP4x4(left4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);
VXC_DP4x4(right4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
bottom4 = right4 * x_lerp + left4;
bottom4 -= top4;
float4 dst4 = bottom4 * y_lerp + top4;
dst4 = dst4 * dfpScale;
int4 dst = convert_int4_rte(dst4);
VXC_DP2x8(top_left, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);
VXC_WriteImage2DArray(output, coord_out, top_left, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
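
The common refactor across all of these files replaces VXC_ReadImage2DArray / VXC_WriteImage2DArray with explicit 3-D accesses: the image is copied into an int8 descriptor, baseAddr = coord.z * desc.s4 + desc.s0 is computed once, carried in coord.w, and advanced by desc.s4 per depth step. Reading the diff, s0 behaves like the base address and s4 like the slice pitch; a plain host-side sketch of that addressing (the field meanings are inferred from usage here, not from a published descriptor layout):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Base + slice-pitch addressing, mirroring what the rewritten kernels do
     * with the int8 image descriptor (s0 ~ base address, s4 ~ slice pitch). */
    typedef struct {
        uint8_t *base;          /* descriptor.s0 */
        size_t   slice_pitch;   /* descriptor.s4: bytes between z slices */
        size_t   row_pitch;     /* bytes between y rows */
    } image3d_desc;

    static uint8_t *pixel_addr(const image3d_desc *d, int x, int y, int z, size_t elem)
    {
        /* baseAddr = z * s4 + s0, then ordinary 2-D addressing inside the slice;
         * the kernels advance this base with coord.w += desc.s4 per depth step. */
        return d->base + (size_t)z * d->slice_pitch
                       + (size_t)y * d->row_pitch
                       + (size_t)x * elem;
    }

    int main(void)
    {
        enum { W = 4, H = 2, D = 3 };
        uint8_t *buf = calloc(W * H * D, 1);
        image3d_desc desc = { buf, W * H, W };
        *pixel_addr(&desc, 3, 1, 2, 1) = 42;
        printf("%u\n", buf[2 * W * H + 1 * W + 3]);   /* 42 */
        free(buf);
        return 0;
    }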

View File

@ -5,8 +5,8 @@ _viv_uniform float2 scale_xy;
_viv_uniform int depth;
_viv_uniform VXC_512Bits uniConvertI32toI16_2x8;
_viv_uniform VXC_512Bits uniGetMaskShift_2x8;
_viv_uniform VXC_512Bits uniConvertDFP2FP32_part1_4x4;
_viv_uniform VXC_512Bits uniConvertDFP2FP32_4x4;
_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4;
_viv_uniform VXC_512Bits uniRightSubLeft_4x4;
_viv_uniform float dfpScale;
_viv_uniform float half_pixel_value;
@ -34,8 +34,6 @@ __kernel void resize_bilinear_I8toI8_UP
float top_y_f = floor(in_y);
float y_lerp = in_y - top_y_f;
int top_y_idx = convert_int(top_y_f);
float bottom_y_f = ceil(in_y);
int bottom_y_idx= convert_int(bottom_y_f);
vxc_uchar16 src0, src1, dst0, dst1;
@ -44,12 +42,15 @@ __kernel void resize_bilinear_I8toI8_UP
int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(src0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
coord_in.y = bottom_y_idx;
VXC_ReadImage2DArray(src1, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
vxc_ushort8 bitextract_p0;
vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
@ -59,37 +60,42 @@ __kernel void resize_bilinear_I8toI8_UP
VXC_DP2x8(maskShift, bitextract_p0, constData, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);
do
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
float4 left4;
float4 right4;
float4 top4;
float4 bottom4;
int loop = depth - 1;
while (coord_in.z < loop)
{
VXC_BitExtract(dst0, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_BitExtract(dst1, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, top, dst0, 16);
_viv_asm(COPY, bottom, dst1, 16);
coord_in.w += input_desc.s4;
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_in.z ++;
coord_in.y = top_y_idx;
VXC_ReadImage2DArray(src0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_in.y = bottom_y_idx;
VXC_ReadImage2DArray(src1, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
float4 left4;
float4 right4;
float4 top4;
float4 bottom4;
VXC_DP4x4(left4, top, top, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);
VXC_DP4x4(right4, top, top, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4);
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(left4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);
VXC_DP4x4(right4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4);
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
bottom4 = right4 * x_lerp + left4;
bottom4 -= top4;
@ -97,10 +103,31 @@ __kernel void resize_bilinear_I8toI8_UP
dst4 = dst4 * dfpScale;
int4 dst = convert_int4_rte(dst4);
VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);
VXC_WriteImage2DArray(output, coord_out, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_out.z ++;
} while (coord_in.z < depth);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_out.w += output_desc.s4;
}
VXC_BitExtract(dst0, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_BitExtract(dst1, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, top, dst0, 16);
_viv_asm(COPY, bottom, dst1, 16);
VXC_DP4x4(left4, top, top, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);
VXC_DP4x4(right4, top, top, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(left4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);
VXC_DP4x4(right4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
bottom4 = right4 * x_lerp + left4;
bottom4 -= top4;
float4 dst4 = bottom4 * y_lerp + top4;
dst4 = dst4 * dfpScale;
int4 dst = convert_int4_rte(dst4);
VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
__kernel void resize_bilinear_I8toI8_DOWN
@ -112,98 +139,55 @@ __kernel void resize_bilinear_I8toI8_DOWN
)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);
float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value;
float4 left_x_f = floor(in_x);
float4 x_lerp = in_x - left_x_f;
int4 left_x_idx = convert_int4(left_x_f);
float4 right_x_f = ceil(in_x);
int4 right_x_idx = convert_int4(right_x_f);
float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;
float top_y_f = floor(in_y);
float y_lerp = in_y - top_y_f;
int top_y_idx = convert_int(top_y_f);
float bottom_y_f = ceil(in_y);
int bottom_y_idx= convert_int(bottom_y_f);
vxc_char16 top_left, top_right;
vxc_char16 bottom_left, bottom_right;
vxc_char16 top, bottom, result;
int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(top_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_ReadImage2DArray(top_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_ReadImage2DArray(top_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_ReadImage2DArray(top_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.x;
VXC_ReadImage2DArray(top_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.y;
VXC_ReadImage2DArray(top_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.z;
VXC_ReadImage2DArray(top_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.w;
VXC_ReadImage2DArray(top_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_in.y = bottom_y_idx;
coord_in.x = left_x_idx.x;
VXC_ReadImage2DArray(bottom_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_ReadImage2DArray(bottom_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_ReadImage2DArray(bottom_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_ReadImage2DArray(bottom_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.x;
VXC_ReadImage2DArray(bottom_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.y;
VXC_ReadImage2DArray(bottom_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.z;
VXC_ReadImage2DArray(bottom_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.w;
VXC_ReadImage2DArray(bottom_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
float4 left4;
float4 right4;
float4 top4;
float4 bottom4;
VXC_DP4x4(left4, top_left, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);
VXC_DP4x4(right4, top_right, top_right, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);
right4 -= left4;
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(left4, bottom_left, bottom_left, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);
VXC_DP4x4(right4, bottom_right, bottom_right, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);
right4 -= left4;
VXC_DP4x4(left4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);
VXC_DP4x4(right4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
bottom4 = right4 * x_lerp + left4;
bottom4 -= top4;
@ -213,6 +197,11 @@ __kernel void resize_bilinear_I8toI8_DOWN
int4 dst = convert_int4_rte(dst4);
VXC_DP2x8(top_left, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);
VXC_WriteImage2DArray(output, coord_out, top_left, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}

View File

@ -1,13 +1,13 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniU8SubZPtoFp32_4x4;
_viv_uniform VXC_512Bits uniU8SubZPtoFp32_left_4x4;
_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4;
_viv_uniform VXC_512Bits uniExtact8Bit_2x8;
_viv_uniform float2 scale_xy;
_viv_uniform int depth;
_viv_uniform int input_ZP;
_viv_uniform float uint8Scale;
_viv_uniform float output_ZP;
_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;
_viv_uniform VXC_512Bits uniConvertI32toI16_2x8;
_viv_uniform VXC_512Bits uniGetMaskShift_2x8;
_viv_uniform float half_pixel_value;
@ -26,69 +26,36 @@ __kernel void resize_bilinear_U8toF16_DOWN
float4 left_x_f = floor(in_x);
float4 x_lerp = in_x - left_x_f;
int4 left_x_idx = convert_int4(left_x_f);
float4 right_x_f = ceil(in_x);
int4 right_x_idx = convert_int4(right_x_f);
float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;
float top_y_f = floor(in_y);
float y_lerp = in_y - top_y_f;
int top_y_idx = convert_int(top_y_f);
float bottom_y_f = ceil(in_y);
int bottom_y_idx= convert_int(bottom_y_f);
vxc_uchar16 top_left, top_right;
vxc_uchar16 bottom_left, bottom_right;
vxc_uchar16 top, bottom;
int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(top_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_ReadImage2DArray(top_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_ReadImage2DArray(top_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_ReadImage2DArray(top_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.x;
VXC_ReadImage2DArray(top_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.y;
VXC_ReadImage2DArray(top_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.z;
VXC_ReadImage2DArray(top_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.w;
VXC_ReadImage2DArray(top_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_in.y = bottom_y_idx;
coord_in.x = left_x_idx.x;
VXC_ReadImage2DArray(bottom_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_ReadImage2DArray(bottom_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_ReadImage2DArray(bottom_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_ReadImage2DArray(bottom_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.x;
VXC_ReadImage2DArray(bottom_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.y;
VXC_ReadImage2DArray(bottom_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.z;
VXC_ReadImage2DArray(bottom_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.w;
VXC_ReadImage2DArray(bottom_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
float4 left4;
float4 right4;
@@ -97,16 +64,12 @@ __kernel void resize_bilinear_U8toF16_DOWN
unsigned char inputZP;
_viv_asm(COPY, inputZP, input_ZP, 4);
VXC_DP4x4(left4, top_left, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);
VXC_DP4x4(right4, top_right, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);
right4 -= left4;
VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);
VXC_DP4x4(right4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(left4, bottom_left, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);
VXC_DP4x4(right4, bottom_right, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);
right4 -= left4;
VXC_DP4x4(left4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);
VXC_DP4x4(right4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
bottom4 = right4 * x_lerp + left4;
bottom4 -= top4;
@@ -120,7 +83,12 @@ __kernel void resize_bilinear_U8toF16_DOWN
vxc_short8 dst_short;
_viv_asm(COPY, dst_short, dst, 16);
VXC_WriteImage2DArray(output, coord_out, dst_short.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst_short.s0246,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
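The repeated descriptor sequence in these kernels (COPY the image into an int8, read .s0 and .s4, stash the result in coord.w) simply precomputes a per-slice base address for the raw img_load_3d/img_store_3d operations. A rough host-side picture, assuming .s0 holds the tensor base address and .s4 the z-slice stride in bytes, an assumption inferred from how the kernels use the descriptor:
/* Illustrative only: mirrors "baseAddr = (int)coord.z * desc.s4 + desc.s0". */
typedef struct { int s0, s1, s2, s3, s4, s5, s6, s7; } image_desc_t;

static int slice_base_addr(const image_desc_t *desc, int z)
{
    return z * desc->s4 + desc->s0;   /* assumed: s0 = base address, s4 = z stride */
}
/* Stepping to the next slice is then a single add, which is what the depth
 * loop below does with "coord_in.w += input_desc.s4". */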
__kernel void resize_bilinear_U8toU8_UP
@@ -147,8 +115,6 @@ __kernel void resize_bilinear_U8toU8_UP
float top_y_f = floor(in_y);
float y_lerp = in_y - top_y_f;
int top_y_idx = convert_int(top_y_f);
float bottom_y_f = ceil(in_y);
int bottom_y_idx= convert_int(bottom_y_f);
vxc_uchar16 src0, src1;
@@ -157,12 +123,15 @@ __kernel void resize_bilinear_U8toU8_UP
int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(src0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
coord_in.y = bottom_y_idx;
VXC_ReadImage2DArray(src1, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
vxc_ushort8 bitextract_p0;
vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
@@ -172,46 +141,67 @@ __kernel void resize_bilinear_U8toU8_UP
VXC_DP2x8(maskShift, bitextract_p0, constData, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);
do
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
float4 left4;
float4 right4;
float4 top4;
float4 bottom4;
int loop = depth - 1;
while (coord_in.z < loop)
{
VXC_BitExtract(top, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_BitExtract(bottom, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.w += input_desc.s4;
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_in.z ++;
coord_in.y = top_y_idx;
VXC_ReadImage2DArray(src0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_in.y = bottom_y_idx;
VXC_ReadImage2DArray(src1, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
float4 left4;
float4 right4;
float4 top4;
float4 bottom4;
unsigned char inputZP;
_viv_asm(COPY, inputZP, input_ZP, 4);
VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);
VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(left4, bottom, inputZP, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);
VXC_DP4x4(right4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
bottom4 = right4 * x_lerp + left4;
bottom4 -= top4;
float4 dst4 = bottom4 * y_lerp + top4;
dst4 = dst4 * uint8Scale + output_ZP;
int4 dst = convert_int4_rte(dst4);
VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);
VXC_WriteImage2DArray(output, coord_out, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_out.w += output_desc.s4;
}
coord_out.z ++;
} while (coord_in.z < depth);
VXC_BitExtract(top, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_BitExtract(bottom, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
unsigned char inputZP;
_viv_asm(COPY, inputZP, input_ZP, 4);
VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(left4, bottom, inputZP, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);
VXC_DP4x4(right4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
bottom4 = right4 * x_lerp + left4;
bottom4 -= top4;
float4 dst4 = bottom4 * y_lerp + top4;
dst4 = dst4 * uint8Scale + output_ZP;
int4 dst = convert_int4_rte(dst4);
VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
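The switch from the old do/while over coord.z to a while (coord_in.z < depth - 1) loop plus a trailing copy of the body looks like plain software pipelining: the loads for slice z+1 are issued inside the loop while slice z is still being interpolated, and the final slice is finished in an epilogue that issues no further load. A schematic C version under that reading, with load_slice()/process_slice() standing in for the img_load_3d and BitExtract/DP4x4/store sequences:
static void load_slice(int z)    { (void)z; /* img_load_3d of src0/src1 for slice z      */ }
static void process_slice(int z) { (void)z; /* BitExtract + lerp + requantize + store z  */ }

static void resize_depth_loop(int depth)
{
    load_slice(0);                      /* prologue: first slice loaded up front   */
    for (int z = 0; z < depth - 1; ++z)
    {
        load_slice(z + 1);              /* prefetch the next slice                 */
        process_slice(z);               /* consume the one already in registers    */
    }
    process_slice(depth - 1);           /* epilogue: last slice, nothing to fetch  */
}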
__kernel void resize_bilinear_U8toU8_DOWN
@@ -228,69 +218,36 @@ __kernel void resize_bilinear_U8toU8_DOWN
float4 left_x_f = floor(in_x);
float4 x_lerp = in_x - left_x_f;
int4 left_x_idx = convert_int4(left_x_f);
float4 right_x_f = ceil(in_x);
int4 right_x_idx = convert_int4(right_x_f);
float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;
float top_y_f = floor(in_y);
float y_lerp = in_y - top_y_f;
int top_y_idx = convert_int(top_y_f);
float bottom_y_f = ceil(in_y);
int bottom_y_idx= convert_int(bottom_y_f);
vxc_uchar16 top_left, top_right;
vxc_uchar16 bottom_left, bottom_right;
vxc_uchar16 top, bottom, result;
int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(top_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_ReadImage2DArray(top_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_ReadImage2DArray(top_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_ReadImage2DArray(top_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.x;
VXC_ReadImage2DArray(top_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.y;
VXC_ReadImage2DArray(top_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.z;
VXC_ReadImage2DArray(top_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.w;
VXC_ReadImage2DArray(top_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_in.y = bottom_y_idx;
coord_in.x = left_x_idx.x;
VXC_ReadImage2DArray(bottom_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_ReadImage2DArray(bottom_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_ReadImage2DArray(bottom_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_ReadImage2DArray(bottom_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.x;
VXC_ReadImage2DArray(bottom_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.y;
VXC_ReadImage2DArray(bottom_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.z;
VXC_ReadImage2DArray(bottom_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.w;
VXC_ReadImage2DArray(bottom_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
float4 left4;
float4 right4;
@@ -299,25 +256,26 @@ __kernel void resize_bilinear_U8toU8_DOWN
unsigned char inputZP;
_viv_asm(COPY, inputZP, input_ZP, 4);
VXC_DP4x4(left4, top_left, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);
VXC_DP4x4(right4, top_right, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);
right4 -= left4;
VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);
VXC_DP4x4(right4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(left4, bottom_left, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);
VXC_DP4x4(right4, bottom_right, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);
right4 -= left4;
VXC_DP4x4(left4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);
VXC_DP4x4(right4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
bottom4 = right4 * x_lerp + left4;
bottom4 -= top4;
float4 dst4 = bottom4 * y_lerp + top4;
dst4 = dst4 * uint8Scale + output_ZP;
int4 dst = convert_int4_rte(dst4);
VXC_DP2x8(top_left, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);
VXC_WriteImage2DArray(output, coord_out, top_left, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}

View File

@@ -69,7 +69,8 @@ __kernel void resize_bilinear_U8toU8_UP_opt
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
do
int loop = depth - 1;
while (coord_in.z < loop)
{
VXC_BitExtract(top_bottom, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_BitExtract(top_bottom, src1, src1, maskShift, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));
@@ -88,8 +89,17 @@ __kernel void resize_bilinear_U8toU8_UP_opt
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_out.w += output_desc.s4;
coord_out.z ++;
} while (coord_out.z < depth);
coord_in.z ++;
}
VXC_BitExtract(top_bottom, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_BitExtract(top_bottom, src1, src1, maskShift, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));
vxc_uchar16 dst;
VXC_DP4x4_b(dst, lerp.hi, lerp.lo, top_bottom,
VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4x4_b);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
#endif

View File

@@ -28,18 +28,30 @@ __kernel void resize_nearest_F16toF16
vxc_short8 src;
int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(src, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.y;
VXC_ReadImage2DArray(src, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.z;
VXC_ReadImage2DArray(src, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.w;
VXC_ReadImage2DArray(src, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
_viv_uniform VXC_512Bits uniGetExtractData_2x8;
@@ -56,18 +68,29 @@ __kernel void resize_nearest_F16toF16_op
vxc_ushort8 src0, src1, dst;
int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(src0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input, coord_in, \
VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
//in_x_idx = in_x_idx - in_x_idx.xxxx;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16);
vxc_ushort8 input_idx;
_viv_asm(COPY, input_idx, in_x_idx, 16);
VXC_DP2x8(mask, input_idx, input_idx, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetExtractData_2x8);
VXC_BitExtract(dst, src0, src1, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
_viv_uniform VXC_512Bits uniConvertI8toI8_2x8;
@@ -84,19 +107,31 @@ __kernel void resize_nearest_I8toI8
vxc_char16 src;
int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(src, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.y;
VXC_ReadImage2DArray(src, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.z;
VXC_ReadImage2DArray(src, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.w;
VXC_ReadImage2DArray(src, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_DP2x8(src, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);
VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
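Each of the four single-lane loads above lands in a different lane of src through the start/end arguments of VXC_MODIFIER, so a quad of output pixels is assembled as a small gather from precomputed x indices on one input row. The scalar equivalent, with src_row and in_x_idx as illustrative names:
/* Gather four nearest-neighbour samples from one input row. */
static void nearest_gather4(const signed char *src_row,
                            const int in_x_idx[4], signed char dst[4])
{
    for (int i = 0; i < 4; ++i)
        dst[i] = src_row[in_x_idx[i]];   /* lane i <- pixel at its own x index */
}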
__kernel void resize_nearest_I8toI8_op
@@ -113,8 +148,14 @@ __kernel void resize_nearest_I8toI8_op
vxc_char16 dst;
int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(src0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8);
vxc_ushort8 input_idx;
_viv_asm(COPY, input_idx, in_x_idx, 16);
@@ -123,7 +164,13 @@ __kernel void resize_nearest_I8toI8_op
VXC_BitExtract(dst0, src0, src0, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, dst, dst0, 8);
VXC_DP2x8(dst, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);
VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
__kernel void resize_nearest_U8toU8
@@ -139,22 +186,34 @@ __kernel void resize_nearest_U8toU8
vxc_uchar16 src;
int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(src, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.y;
VXC_ReadImage2DArray(src, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.z;
VXC_ReadImage2DArray(src, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.w;
VXC_ReadImage2DArray(src, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
vxc_ushort8 multiplier;
_viv_asm(COPY, multiplier, multAndoutZP, 16);
VXC_DP2x8(src, src, multiplier, \
VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8);
VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
__kernel void resize_nearest_U8toU8_op
@@ -170,8 +229,14 @@ __kernel void resize_nearest_U8toU8_op
vxc_uchar16 src0, dst;
int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(src0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8);
vxc_ushort8 input_idx;
_viv_asm(COPY, input_idx, in_x_idx, 16);
@@ -180,7 +245,13 @@ __kernel void resize_nearest_U8toU8_op
vxc_ushort8 multiplier;
_viv_asm(COPY, multiplier, multAndoutZP, 16);
VXC_DP2x8(dst, dst, multiplier, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8);
VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
__kernel void resize_nearest_I16toI16
@@ -196,19 +267,32 @@ __kernel void resize_nearest_I16toI16
vxc_short8 src;
int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(src, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.y;
VXC_ReadImage2DArray(src, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.z;
VXC_ReadImage2DArray(src, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.w;
VXC_ReadImage2DArray(src, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_DP2x8(src, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);
VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
__kernel void resize_nearest_I16toI16_op
@@ -224,10 +308,16 @@ __kernel void resize_nearest_I16toI16_op
vxc_ushort8 src0, src1, dst0;
vxc_short8 dst;
int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(src0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input, coord_in, \
VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
//in_x_idx = in_x_idx - in_x_idx.xxxx;
vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16);
@@ -237,5 +327,11 @@ __kernel void resize_nearest_I16toI16_op
VXC_BitExtract(dst0, src0, src1, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, dst, dst0, 8);
VXC_DP2x8(dst, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);
VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}

View File

@@ -0,0 +1,135 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniExtractEvenUint8Stride2_2x8;
_viv_uniform VXC_512Bits uniExtractOddUint8Stride2_2x8;
_viv_uniform VXC_512Bits uniExtractEvenFp16Stride2_4x4;
_viv_uniform VXC_512Bits uniExtractOddFp16Stride2_4x4;
_viv_uniform int input_depth;
#define SPACE2DEPTH_INTERNAL_QINT_TO_QINT(src0_type_name, src1_type_name, read_type) \
__kernel void space2depth_internal_##src0_type_name##to##src1_type_name( \
image2d_array_t input, \
image2d_array_t output, \
int block_size_x, \
int block_size_y \
) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
int gidz = get_global_id(2); \
int4 coord = (int4)(gidx, gidy, gidz, 0); \
read_type src; \
VXC_ReadImage2DArray(src, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
\
ushort stride_x = (ushort)block_size_x; \
ushort stride_y = (ushort)block_size_y; \
ushort sidx = (ushort)gidx; \
ushort sidy = (ushort)gidy; \
ushort tmpX = sidx % stride_x; \
ushort tmpY = sidy % stride_y; \
int tmpId0 = tmpX; \
int tmpId1 = tmpY; \
int4 coord_out = (int4)((int)(sidx / stride_x), (int)(sidy / stride_y), 0, 0); \
coord_out.z = tmpId0 * input_depth + tmpId1 * block_size_x * input_depth + gidz; \
VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
}
SPACE2DEPTH_INTERNAL_QINT_TO_QINT(U8, U8, vxc_uchar16)
SPACE2DEPTH_INTERNAL_QINT_TO_QINT(I8, I8, vxc_char16)
SPACE2DEPTH_INTERNAL_QINT_TO_QINT(I16, I16, vxc_short8)
__kernel void space2depth_internal_F16toF16(
image2d_array_t input,
image2d_array_t output,
int block_size_x,
int block_size_y
)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int gidz = get_global_id(2);
int4 coord = (int4)(gidx, gidy, gidz, 0);
vxc_short8 data, imgVal0;
VXC_ReadImage2DArray(data, input, coord, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
ushort stride_x = (ushort)block_size_x;
ushort stride_y = (ushort)block_size_y;
ushort sidx = (ushort)gidx;
ushort sidy = (ushort)gidy;
ushort tmpX = sidx % stride_x;
ushort tmpY = sidy % stride_y;
int tmpId0 = tmpX;
int tmpId1 = tmpY;
int4 coord_out = (int4)((int)(sidx / stride_x), (int)(sidy / stride_y), 0, 0);
coord_out.z = tmpId0 * input_depth + tmpId1 * block_size_x * input_depth + gidz;
VXC_WriteImage2DArray(output, coord_out, data, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
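The coordinate arithmetic shared by all space2depth_internal kernels above reduces to the mapping below: the block-local (x, y) phase selects a depth slab of size input_depth, while the spatial position is divided by the block size. A scalar sketch, where block_x, block_y and in_depth are the host-side equivalents of block_size_x, block_size_y and input_depth:
typedef struct { int x, y, z; } coord3_t;

/* Where input element (x, y, z) ends up in the space-to-depth output. */
static coord3_t space2depth_map(int x, int y, int z,
                                int block_x, int block_y, int in_depth)
{
    coord3_t out;
    out.x = x / block_x;
    out.y = y / block_y;
    /* same as: coord_out.z = tmpX*input_depth + tmpY*block_size_x*input_depth + gidz */
    out.z = (x % block_x) * in_depth + (y % block_y) * block_x * in_depth + z;
    return out;
}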
#define SPACE2DEPTH_INTERNAL_QINT_TO_QINT_X2Y1(src0_type_name, src1_type_name, read_type, write_type) \
__kernel void space2depth_internal_##src0_type_name##to##src1_type_name##_X2Y1( \
image2d_array_t input, \
image2d_array_t output, \
int block_size_x, \
int block_size_y \
) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
int gidz = get_global_id(2); \
\
int4 coord = (int4)(gidx, gidy, gidz, 0); \
int4 coord_out = (int4)(gidx >> 1, gidy, gidz, 0); \
int out_d1; \
read_type imageData; \
write_type imgVal0, imgVal1; \
\
VXC_ReadImage2DArray(imageData, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
\
out_d1 = gidz + input_depth; \
\
VXC_DP2x8(imgVal0, imageData, imageData,\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractEvenUint8Stride2_2x8); \
VXC_DP2x8(imgVal1, imageData, imageData,\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddUint8Stride2_2x8); \
VXC_WriteImage2DArray(output, coord_out, imgVal0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
coord_out.z = out_d1; \
VXC_WriteImage2DArray(output, coord_out, imgVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
SPACE2DEPTH_INTERNAL_QINT_TO_QINT_X2Y1(U8, U8, vxc_uchar16, vxc_uchar16)
SPACE2DEPTH_INTERNAL_QINT_TO_QINT_X2Y1(I8, I8, vxc_char16, vxc_char16)
#define SPACE2DEPTH_INTERNAL_16BITS_X2Y1(src0_type_name, src1_type_name, read_type, write_type) \
__kernel void space2depth_internal_##src0_type_name##to##src1_type_name##_X2Y1( \
image2d_array_t input, \
image2d_array_t output, \
int block_size_x, \
int block_size_y \
) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
int gidz = get_global_id(2); \
\
int4 coord = (int4)(gidx, gidy, gidz, 0); \
int4 coord_out = (int4)(gidx >> 1, gidy, gidz, 0); \
int out_d1; \
read_type imageData; \
write_type imgVal0, imgVal1; \
\
VXC_ReadImage2DArray(imageData, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
out_d1 = gidz + input_depth; \
VXC_DP4x4(imgVal0, imageData, imageData, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractEvenFp16Stride2_4x4); \
VXC_DP4x4(imgVal1, imageData, imageData, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractOddFp16Stride2_4x4); \
VXC_WriteImage2DArray(output, coord_out, imgVal0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
coord_out.z = out_d1; \
VXC_WriteImage2DArray(output, coord_out, imgVal1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
SPACE2DEPTH_INTERNAL_16BITS_X2Y1(I16, I16, vxc_short8, vxc_short8)
SPACE2DEPTH_INTERNAL_16BITS_X2Y1(F16, F16, vxc_short8, vxc_short8)

View File

@@ -0,0 +1,58 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniConvertDatatoF32_4x4;
_viv_uniform float output_scale;
_viv_uniform float tail;
#define UPSAMPLE_SCALETO_FUN(src_name, dst_name, read_type, src_type, dst_type, write_type, conv_func) \
__kernel void upsamplescale_##src_name##to##dst_name( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int stride, \
float scale) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
read_type read_val; \
src_type src_val; \
dst_type dst_val; \
write_type write_val; \
VXC_ReadImage2DArray(read_val, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, src_val, read_val, 16); \
coord.xy *= stride; \
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord.w, baseAddr); \
float4 data; \
VXC_DP4x4(data, src_val, src_val, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertDatatoF32_4x4); \
data = data * output_scale + tail; \
_viv_asm(conv_func, dst_val, data); \
_viv_asm(COPY, write_val, dst_val, 16); \
int4 coord_out = coord; \
for (int y = 0; y < stride; y++) \
{ \
coord_out.x = coord.x; \
for (int x = 0; x < stride; ) \
{ \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, write_val, \
VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); \
x++; \
coord_out.x ++; \
} \
coord_out.y ++; \
} \
}
UPSAMPLE_SCALETO_FUN(F16, F16, vxc_short8, vxc_half8, half4, short4, CONV)
UPSAMPLE_SCALETO_FUN(F16, I16, vxc_short8, vxc_half8, int4, short4, CONV_RTE)
UPSAMPLE_SCALETO_FUN(F16, I8, vxc_short8, vxc_half8, int4, char4, CONV_RTE)
UPSAMPLE_SCALETO_FUN(F16, U8, vxc_short8, vxc_half8, int4, uchar4, CONV_RTE)
UPSAMPLE_SCALETO_FUN(I16, I16, vxc_short8, vxc_short8, int4, short4, CONV_RTE)
UPSAMPLE_SCALETO_FUN(I16, F16, vxc_short8, vxc_short8, half4, short4, CONV)
UPSAMPLE_SCALETO_FUN(I8, I8, vxc_char16, vxc_char16, int4, char4, CONV_RTE)
UPSAMPLE_SCALETO_FUN(I8, F16, vxc_short8, vxc_short8, half4, short4, CONV)
UPSAMPLE_SCALETO_FUN(U8, U8, vxc_uchar16, vxc_uchar16, int4, uchar4, CONV_RTE)
UPSAMPLE_SCALETO_FUN(U8, F16, vxc_short8, vxc_short8, half4, short4, CONV)
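Functionally, the generic upsamplescale kernel above is a nearest-neighbour upsample by an integer stride with the (de)quantisation folded into output_scale and tail: each input value is rescaled once and then written into a stride x stride block of the output. A host-side reference for a single element, with out_w, x and y as illustrative names:
/* Replicate one rescaled input value into its stride x stride output block. */
static void upsamplescale_ref(float in_val, float *out, int out_w,
                              int x, int y, int stride,
                              float output_scale, float tail)
{
    float v = in_val * output_scale + tail;       /* data * output_scale + tail */
    for (int dy = 0; dy < stride; ++dy)
        for (int dx = 0; dx < stride; ++dx)
            out[(y * stride + dy) * out_w + (x * stride + dx)] = v;
}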

View File

@@ -0,0 +1,83 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniUpScale2X_lo_2x8;
_viv_uniform VXC_512Bits uniUpScale2X_hi_2x8;
_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp
#define UPSAMPLE_SCALETO8B_FUN(src_name, dst_name, read_type, src_type, dst_type) \
__kernel void upsamplescale_##src_name##to##dst_name##_K2( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int stride, \
float scale) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
read_type read_val; \
src_type src_val; \
dst_type dst_val; \
VXC_ReadImage2DArray(read_val, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, src_val, read_val, 16); \
coord.xy <<= 1; \
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord.w, baseAddr); \
vxc_ushort8 multiplier; \
_viv_asm(COPY, multiplier, multAndoutZP, 16); \
VXC_DP2x8(dst_val, src_val, multiplier, \
VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniUpScale2X_lo_2x8); \
VXC_DP2x8(dst_val, src_val, multiplier, \
VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniUpScale2X_hi_2x8); \
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst_val, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \
coord.y ++; \
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst_val, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \
}
UPSAMPLE_SCALETO8B_FUN(F16, I8, vxc_short8, vxc_half8, vxc_char16)
UPSAMPLE_SCALETO8B_FUN(F16, U8, vxc_short8, vxc_half8, vxc_uchar16)
UPSAMPLE_SCALETO8B_FUN(I8, I8, vxc_char16, vxc_char16, vxc_char16)
UPSAMPLE_SCALETO8B_FUN(U8, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16)
#define UPSAMPLE_SCALETO16B_FUN(src_name, dst_name, read_type, src_type, dst_type, write_type) \
__kernel void upsamplescale_##src_name##to##dst_name##_K2( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int stride, \
float scale) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
read_type read_val; \
src_type src_val; \
dst_type dst0_val; \
dst_type dst1_val; \
write_type write_val; \
VXC_ReadImage2DArray(read_val, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, src_val, read_val, 16); \
coord.xy <<= 1; \
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord.w, baseAddr); \
vxc_ushort8 multiplier; \
_viv_asm(COPY, multiplier, multAndoutZP, 16); \
VXC_DP2x8(dst0_val, src_val, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniUpScale2X_lo_2x8); \
VXC_DP2x8(dst1_val, src_val, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniUpScale2X_hi_2x8); \
_viv_asm(COPY, write_val, dst0_val, 16); \
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
coord.y ++; \
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, write_val, dst1_val, 16); \
coord.xy = coord.xy + (int2)(8, -1); \
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
coord.y ++; \
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
}
UPSAMPLE_SCALETO16B_FUN(F16, F16, vxc_short8, vxc_half8, vxc_half8, vxc_short8)
UPSAMPLE_SCALETO16B_FUN(F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)
UPSAMPLE_SCALETO16B_FUN(I8, F16, vxc_char16, vxc_char16, vxc_half8, vxc_short8)
UPSAMPLE_SCALETO16B_FUN(U8, F16, vxc_uchar16, vxc_uchar16, vxc_half8, vxc_short8)
UPSAMPLE_SCALETO16B_FUN(I16, F16, vxc_short8, vxc_short8, vxc_half8, vxc_short8)
UPSAMPLE_SCALETO16B_FUN(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)
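The _K2 variants above specialise stride == 2: the uniUpScale2X_lo/hi dot products appear to duplicate each source element into adjacent output lanes while applying the multiplier and zero point packed into multAndoutZP, and each produced row is stored twice (rows y and y+1). A scalar sketch of that duplication under those assumptions, with requantisation written out inline:
/* Stride-2 fast path: each input element fills a 2x2 output block. */
static void upscale2x_row_ref(const unsigned char *in_row, unsigned char *out,
                              int out_w, int y, int n,
                              float mult, float out_zp)
{
    for (int i = 0; i < n; ++i)
    {
        float v = in_row[i] * mult + out_zp;              /* requantise once   */
        unsigned char q = (unsigned char)(v + 0.5f);
        out[y * out_w + 2 * i]           = q;             /* even lane         */
        out[y * out_w + 2 * i + 1]       = q;             /* odd lane          */
        out[(y + 1) * out_w + 2 * i]     = q;             /* row written twice */
        out[(y + 1) * out_w + 2 * i + 1] = q;
    }
}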

View File

@@ -1,111 +0,0 @@
#include "cl_viv_vx_ext.h"
//-----------------------------------------------tensor crop-------------------------------
__kernel void vxcTensorCrop_Int16(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int offset0,
int offset1,
int offset2)
{
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_ushort8 src0, src1, src2, src3;
VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
int4 coord_out = (int4)(get_global_id(0) - offset0, get_global_id(1)\
- offset1, get_global_id(2) - offset2, 0);
VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
coord_out.y ++;
VXC_WriteImage2DArray(output, coord_out, src1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
coord_out.y ++;
VXC_WriteImage2DArray(output, coord_out, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
coord_out.y ++;
VXC_WriteImage2DArray(output, coord_out, src3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
__kernel void vxcTensorCrop_Int8(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int offset0,
int offset1,
int offset2)
{
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_uchar16 src0, src1, src2, src3;
VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
int4 coord_out = (int4)(get_global_id(0) - offset0, get_global_id(1) - offset1,\
get_global_id(2) - offset2, 0);
VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
coord_out.y ++;
VXC_WriteImage2DArray(output, coord_out, src1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
coord_out.y ++;
VXC_WriteImage2DArray(output, coord_out, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
coord_out.y ++;
VXC_WriteImage2DArray(output, coord_out, src3, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
}
_viv_uniform VXC_512Bits uniConvertInt16toFp16_2x8;
__kernel void vxcTensorCrop_Int16_Fp16(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int offset0,
int offset1,
int offset2)
{
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_short8 src0, src1, src2, src3;
VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
int4 coord_out = (int4)(get_global_id(0) - offset0, get_global_id(1)\
- offset1, get_global_id(2) - offset2, 0);
vxc_half8 dst0, dst1, dst2, dst3;
VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt16toFp16_2x8);
VXC_DP2x8(dst1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt16toFp16_2x8);
VXC_DP2x8(dst2, src2, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt16toFp16_2x8);
VXC_DP2x8(dst3, src3, src3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt16toFp16_2x8);
vxc_short8 out0, out1, out2, out3;
_viv_asm(COPY, out0, dst0, 16);
_viv_asm(COPY, out1, dst1, 16);
_viv_asm(COPY, out2, dst2, 16);
_viv_asm(COPY, out3, dst3, 16);
VXC_WriteImage2DArray(output, coord_out, out0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
coord_out.y ++;
VXC_WriteImage2DArray(output, coord_out, out1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
coord_out.y ++;
VXC_WriteImage2DArray(output, coord_out, out2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
coord_out.y ++;
VXC_WriteImage2DArray(output, coord_out, out3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}

View File

@@ -1,63 +0,0 @@
#include "cl_viv_vx_ext.h"
_viv_uniform int loopNum;
_viv_uniform VXC_512Bits uniMulAcc_16x1;
__kernel void vsi_nn_kernel_fullconnect2(
__read_only image2d_array_t input,
__read_only image2d_array_t weight,
__read_only image2d_array_t bias,
__write_only image2d_array_t output)
{
int4 coord_in = (int4)(16, get_global_id(0), get_global_id(1), 0);
int2 coord_out = (int2)(get_global_id(0), get_global_id(1));
vxc_short8 v0, v1, v2, v3, v4, v5, v6, v7;
vxc_half8 i0, i1, i2, i3;
vxc_half8 w0, w1, w2, w3;
float4 sum = 0;
float dst = 0;
dst = read_imagef(bias, coord_in.ywww).x;
do
{
VXC_ReadImage(v0, input, coord_in.xz, VXC_5BITOFFSET_XY(-16, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, i0, v0, 16);
VXC_ReadImage(v1, weight, coord_in.xy, VXC_5BITOFFSET_XY(-16, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, w0, v1, 16);
VXC_ReadImage(v2, input, coord_in.xz, VXC_5BITOFFSET_XY(-8, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, i1, v2, 16);
VXC_ReadImage(v3, weight, coord_in.xy, VXC_5BITOFFSET_XY(-8, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, w1, v3, 16);
VXC_ReadImage(v4, input, coord_in.xz, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, i2, v4, 16);
VXC_ReadImage(v5, weight, coord_in.xy, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, w2, v5, 16);
VXC_ReadImage(v6, input, coord_in.xz, VXC_5BITOFFSET_XY(8, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, i3, v6, 16);
VXC_ReadImage(v7, weight, coord_in.xy, VXC_5BITOFFSET_XY(8, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, w3, v7, 16);
coord_in.x += 32;
VXC_DP16x1(sum, i0, w0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniMulAcc_16x1);
VXC_DP16x1(sum, i1, w1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniMulAcc_16x1);
VXC_DP16x1(sum, i2, w2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), uniMulAcc_16x1);
VXC_DP16x1(sum, i3, w3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), uniMulAcc_16x1);
float4 tmp = {1, 1, 1, 1};
dst = dst + dot(sum, tmp);
} while (coord_in.x < loopNum);
vxc_half v;
_viv_asm(CONV, v, dst);
_viv_asm(COPY, v0, v, 16);
VXC_WriteImage(output, coord_out.xy, v0, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0));
}

View File

@@ -1,129 +0,0 @@
#include "cl_viv_vx_ext.h"
/*****************************layernorm uint8 to fp16****************************/
_viv_uniform int width;
_viv_uniform float dimRatio;
_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;
_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniSumU8_16x1;
_viv_uniform VXC_512Bits uniSqrSum_16x1;
_viv_uniform float input_scale;
_viv_uniform int inputZP;
_viv_uniform int sumInZp;
_viv_uniform int tmpZp1;
_viv_uniform int tmpZp2;
_viv_uniform float e2InScale;
_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;
_viv_uniform VXC_512Bits UniPackFP16even_2x8;
__kernel void vxcLayerNormU8toFp16(
image2d_array_t input,
image2d_array_t bias,
image2d_array_t scale,
image2d_array_t output,
float eps)
{
int4 coord = (int4)(0, get_global_id(1), 0, 0);
vxc_uchar16 src0;
float sum = 0, sqr = 0;
int tmpSum = 0, tmpSqr = 0;
vxc_int4 tmpSum1;
vxc_int4 tmpSqr1;
for(coord.x = 0; coord.x < width; coord.x += 16)
{
VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);
tmpSum += (tmpSum1.x);
VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);
tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);
}
sum = (tmpSum + sumInZp) * input_scale;
sqr = (tmpSqr + tmpZp2) * e2InScale;
float mean, vari;
mean = sum * dimRatio;
vari = sqr*dimRatio - mean*mean;
vari += eps;
vari = rsqrt(vari);
vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;
int4 coord_bias = (int4)(0, 0, 0, 0);
vxc_half8 scale_h;
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
vxc_short8 src1, outval;
short zp = inputZP;
half4 tmpVal0, tmpVal1;
vxc_half8 dst;
for(coord.x = 0; coord.x < width; coord.x += 16)
{
coord_bias.x = coord.x;
VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
bias_f1 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert1stUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert2ndUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert3rdUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert4thUint8SubZpToFp32_4x4);
tmpData0 *= input_scale;
tmpData1 *= input_scale;
tmpData2 *= input_scale;
tmpData3 *= input_scale;
vxc_float4 norm;
tmpData0 -= mean;
norm = scale_f0 * vari * tmpData0 + bias_f0;
bias_f0 = read_imagef(bias, coord_bias);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
coord_bias.x += 4;
_viv_asm(CONV, tmpVal0, norm);
tmpData1 -= mean;
norm = scale_f1 * vari * tmpData1 + bias_f1;
bias_f1 = read_imagef(bias, coord_bias);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
_viv_asm(CONV, tmpVal1, norm);
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
UniPackFP16even_2x8);
_viv_asm(COPY, outval, dst, 16);
int2 coord_out = (int2)(coord.x, coord.y);
VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
tmpData2 -= mean;
norm = scale_f0 * vari * tmpData2 + bias_f0;
_viv_asm(CONV, tmpVal0, norm);
tmpData3 -= mean;
norm = scale_f1 * vari * tmpData3 + bias_f1;
_viv_asm(CONV, tmpVal1, norm);
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
UniPackFP16even_2x8);
_viv_asm(COPY, outval, dst, 16);
coord_out.x += 8;
VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
}

View File

@@ -1,38 +0,0 @@
#include "cl_viv_vx_ext.h"
//--------------------------resize-------------------------
_viv_uniform VXC_512Bits uniPackEvenData_2x8;
__kernel void resize_16bits_downsample_quarter
(
__read_only image2d_array_t input,
__write_only image2d_array_t output
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
vxc_short8 src0, src1;
VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(8, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord = coord >> 1;
VXC_DP2x8(src0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardInf, 0), uniPackEvenData_2x8);
VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
__kernel void resize_8bits_downsample_quarter
(
__read_only image2d_array_t input,
__write_only image2d_array_t output
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
vxc_char16 src0;
vxc_char8 dst;
VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord = coord >> 1;
dst = src0.s02468ace;
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}

View File

@@ -1,49 +0,0 @@
#include "cl_viv_vx_ext.h"
//--------------------------scale-------------------------
_viv_uniform VXC_512Bits uniExtractHalf8_2x8;
_viv_uniform VXC_512Bits uniFp16MulFp16ToFp32_Lo_4x4;
_viv_uniform VXC_512Bits uniFp16MulFp16ToFp32_Hi_4x4;
__kernel void scale_fp16
(
__read_only image2d_array_t input,
__read_only image2d_array_t weights,
__read_only image2d_array_t biases,
__write_only image2d_array_t output
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);
vxc_short8 vec0, vec1;
vxc_half8 src0;
vxc_half8 w0;
vxc_float4 b0, b1;
vxc_float4 dst0, dst1;
VXC_ReadImage(vec0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, src0, vec0, 16);
VXC_ReadImage(vec1, weights, coord.xw, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, w0, vec1, 16);
coord.z = coord.x + 4;
b0 = read_imagef(biases, coord.xwww);
b1 = read_imagef(biases, coord.zwww);
VXC_DP4x4(dst0, src0, w0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniFp16MulFp16ToFp32_Lo_4x4);
VXC_DP4x4(dst1, src0, w0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniFp16MulFp16ToFp32_Hi_4x4);
dst0 += b0;
dst1 += b1;
half4 t0, t1;
_viv_asm(CONV, t0, dst0);
_viv_asm(CONV, t1, dst1);
VXC_DP2x8(w0, t0, t1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8);
_viv_asm(COPY, vec0, w0, 16);
VXC_WriteImage(output, coord.xy, vec0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}

View File

@@ -1,67 +0,0 @@
#include "cl_viv_vx_ext.h"
/******************shuffle channel float16/int16********************/
_viv_uniform int group_column;
_viv_uniform float rgroup_column;
__kernel void shuffleChannelVXC(
image2d_array_t input,
image2d_array_t output,
int group_number,
int axis)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_short8 src0, src1, src2, src3;
VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 1),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src2, input, coord, VXC_5BITOFFSET_XY(0, 2),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src3, input, coord, VXC_5BITOFFSET_XY(0, 3),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
int coordz = coord.z;
int index_col = coordz * rgroup_column;
int index_row = coordz - index_col * group_column;
coord.z = index_row * group_number + index_col;
VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
coord.y ++;
VXC_WriteImage2DArray(output, coord, src1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
coord.y ++;
VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
coord.y ++;
VXC_WriteImage2DArray(output, coord, src3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
/*****************shuffle channel int8/uint8****************************/
__kernel void shuffleChannel8BitsVXC(
image2d_array_t input,
image2d_array_t output,
int group_number,
int axis)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_char16 src0, src1, src2, src3;
VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 1),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src2, input, coord, VXC_5BITOFFSET_XY(0, 2),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src3, input, coord, VXC_5BITOFFSET_XY(0, 3),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
int coordz = coord.z;
int index_col = coordz * rgroup_column;
int index_row = coordz - index_col * group_column;
coord.z = index_row * group_number + index_col;
VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
coord.y ++;
VXC_WriteImage2DArray(output, coord, src1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
coord.y ++;
VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
coord.y ++;
VXC_WriteImage2DArray(output, coord, src3, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
}
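Both shuffle-channel shaders in this file (and the Axis1 variants in the next one) use the same channel index remapping, driven by the group_column uniform and its reciprocal rgroup_column. A host-side C sketch of the mapping, with hypothetical names and assuming group_column = channels / group_number:
/* Map an input channel index to its shuffled output channel index. */
static int shuffle_channel_index(int c, int group_number, int group_column)
{
    int index_col = c / group_column;              /* which group the channel came from   */
    int index_row = c - index_col * group_column;  /* position of the channel in that group */
    return index_row * group_number + index_col;   /* interleave channels across groups    */
}
This is the usual channel-shuffle transpose: the channel axis is viewed as a [group_number, group_column] matrix and read back as [group_column, group_number].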

View File

@ -1,65 +0,0 @@
#include "cl_viv_vx_ext.h"
/******************shuffle channel float16/int16********************/
_viv_uniform int group_column;
_viv_uniform float rgroup_column;
__kernel void shuffleChannel16Bits_Axis1(
image2d_array_t input,
image2d_array_t output,
int group_number,
int axis)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_out = coord;
vxc_short8 src0, src1, src2, src3;
VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.x += 8;
VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.x += 8;
VXC_ReadImage2DArray(src2, input, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.x += 8;
VXC_ReadImage2DArray(src3, input, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
int coordy = coord.y;
int index_col = coordy * rgroup_column;
int index_row = coordy - index_col * group_column;
coord_out.y = index_row * group_number + index_col;
VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
coord_out.x += 8;
VXC_WriteImage2DArray(output, coord_out, src1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
coord_out.x += 8;
VXC_WriteImage2DArray(output, coord_out, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
coord_out.x += 8;
VXC_WriteImage2DArray(output, coord_out, src3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
/*****************shuffle channel int8/uint8****************************/
__kernel void shuffleChannel8Bits_Axis1(
image2d_array_t input,
image2d_array_t output,
int group_number,
int axis)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_out = coord;
vxc_char16 src0, src1;
VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord.x += 16;
VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
int coordy = coord.y;
int index_col = coordy * rgroup_column;
int index_row = coordy - index_col * group_column;
coord_out.y = index_row * group_number + index_col;
VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
coord_out.x += 16;
VXC_WriteImage2DArray(output, coord_out, src1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
}

View File

@ -1,41 +0,0 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniExtractEvenFp16Stride2_4x4;
_viv_uniform VXC_512Bits uniExtractOddFp16Stride2_4x4;
_viv_uniform int input_depth;
__kernel void vxcReorg2_fp16_fp16_sx2_sy1
(
image2d_array_t input,
image2d_array_t output,
int stridex,
int stridey
)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int gidz = get_global_id(2);
int4 coord = (int4)(gidx, gidy, gidz, 0);
int4 coord_out = (int4)(gidx >> 1, gidy, 0, 0);
int out_d0, out_d1;
vxc_short8 imageData;
vxc_short8 imgVal0, imgVal1;
//int tmpw = gidz / input_depth; \n\
//int tmpz = gidz % input_depth; \n\
VXC_ReadImage2DArray(imageData, input, coord, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_DP4x4(imgVal0, imageData, imageData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),
uniExtractEvenFp16Stride2_4x4);
VXC_DP4x4(imgVal1, imageData, imageData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),
uniExtractOddFp16Stride2_4x4);
out_d0 = gidz * 2 * 1;
out_d1 = out_d0 + 1;
coord_out.z = out_d0;
VXC_WriteImage2DArray(output, coord_out, imgVal0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
coord_out.z = out_d1;
VXC_WriteImage2DArray(output, coord_out, imgVal1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}
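The deleted reorg shader covers the stride_x = 2, stride_y = 1 case by splitting even and odd columns into two output channels. A plain C reference loop for the general stride_x case, assuming dense [z][y][x] layout (names illustrative, not the GPU implementation):
/* Fold width into the channel dimension: out[ox, y, z*stride_x + k] = in[ox*stride_x + k, y, z]. */
static void reorg_sx_ref(const float *in, float *out,
                         int width, int height, int depth, int stride_x)
{
    int out_width = width / stride_x;
    int z, y, ox, k;
    for (z = 0; z < depth; ++z)
        for (y = 0; y < height; ++y)
            for (ox = 0; ox < out_width; ++ox)
                for (k = 0; k < stride_x; ++k)
                {
                    int oz = z * stride_x + k;   /* interleaved output channel */
                    out[(oz * height + y) * out_width + ox] =
                        in[(z * height + y) * width + ox * stride_x + k];
                }
}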

File diff suppressed because it is too large

View File

@ -10,8 +10,11 @@ CFLAGS += -fvisibility=hidden -D'OVXLIB_API=__attribute__((visibility("default")
################################################################################
# Supply necessary libraries.
LIBS += -L$(VIVANTE_SDK_LIB) -l OpenVX -l OpenVXU -l CLC -l VSC
ifeq ($(USE_VXC_BINARY)$(USE_VSC_LITE),11)
LIBS += -L$(VIVANTE_SDK_LIB) -l OpenVX -l OpenVXU -l CLC -l VSC_Lite -lGAL
else
LIBS += -L$(VIVANTE_SDK_LIB) -l OpenVX -l OpenVXU -l CLC -l VSC -lGAL
endif
LIBS += -lm -ldl
#############################################################################

View File

@ -219,7 +219,10 @@ static vsi_bool op_check
IO_TYPE(D_F32, D_I32)
IO_TYPE(D_F16, D_I32)
IO_TYPE(D_I32, D_I32)
IO_TYPE(D_U8|Q_ASYM, D_I32)
IO_TYPE(D_I8|Q_DFP, D_I32)
IO_TYPE(D_I8, D_I32)
IO_TYPE(D_U8|Q_ASYM, D_I32)
IO_TYPE(D_U8, D_I32)
END_IO_TYPE_DECL(ARGMIN)
if(!VALIDATE_OP_IO_TYPES(ARGMIN, self, inputs, self->input.num, outputs, self->output.num)) {
char* desc = generate_op_io_types_desc(inputs,

View File

@ -44,190 +44,6 @@
#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM)
#define _PARAM_NUM (_ARG_NUM + _IO_NUM)
#define USE_OVX_API TRUE
#if (USE_OVX_API == FALSE)
extern vx_kernel_description_t * vx_kernel_CROP_list[];
static void _set_inputs_outputs
(
vx_reference * params,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
uint32_t i;
uint32_t cnt;
/* Set inputs */
cnt = 0;
for( i = 0; i < _INPUT_NUM; i ++, cnt ++ )
{
params[cnt] = (vx_reference)inputs[i]->t;
}
/* Set outputs */
for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ )
{
params[cnt] = (vx_reference)outputs[i]->t;
}
} /* _set_inputs_outputs() */
static vsi_status _create_params
(
vsi_nn_node_t * node,
vx_reference * params,
uint32_t num
)
{
vsi_status status;
vx_context ctx;
vsi_nn_crop_param * p;
if( 0 == num )
{
return VSI_SUCCESS;
}
memset( params, 0, sizeof( vx_reference * ) * num );
p = &(node->nn_param.crop);
ctx = vxGetContext( (vx_reference)node->graph->g );
/* Init parameters */
#define _SET_PARAM( i, type, arg ) do{ \
params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \
status = vxGetStatus( params[i] ); \
if( VSI_SUCCESS != status ) { \
goto set_param_error; \
} \
} while(0)
_SET_PARAM( 0, VX_TYPE_INT32, offset[0] );
_SET_PARAM( 1, VX_TYPE_INT32, offset[1] );
_SET_PARAM( 2, VX_TYPE_INT32, offset[2] );
#undef _SET_PARAM
set_param_error:
return status;
} /* _create_params */
static void _release_params
(
vx_reference * params,
uint32_t num
)
{
uint32_t i;
vx_scalar scalar;
for( i = 0; i < num; i ++ )
{
scalar = (vx_scalar)params[i];
vxReleaseScalar( &scalar );
}
} /* _release_params() */
static vsi_status cpu_op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_status status = VSI_SUCCESS;
vx_reference params[_PARAM_NUM];
vx_reference * args;
args = &params[_IO_NUM];
if( NULL == self->n )
{
return VSI_FAILURE;
}
/* Set inputs and outputs */
_set_inputs_outputs( params, inputs, outputs );
/* Init parameters. */
_create_params( self, args, _ARG_NUM );
/* Pass parameters to node. */
status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM );
_release_params( args, _ARG_NUM );
return status;
}
static vsi_status vx_op_pre_init
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs,
vsi_nn_kernel_info_t * kernel_info
)
{
vsi_nn_type_e dataFormat = inputs[0]->attr.dtype.vx_type;
vsi_nn_type_e dstFormat = outputs[0]->attr.dtype.vx_type;
if (dataFormat == VSI_NN_TYPE_FLOAT16
|| (dataFormat == VSI_NN_TYPE_INT16 && dstFormat == VSI_NN_TYPE_INT16))
{
kernel_info->kernel_index = 1;
}
else if(dataFormat == VSI_NN_TYPE_INT16 && dstFormat == VSI_NN_TYPE_FLOAT16)
{
kernel_info->kernel_index = 3;
}
else
{
kernel_info->kernel_index = 2;
}
return VSI_SUCCESS;
}
static vsi_status vx_op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_status status = VSI_SUCCESS;
vx_reference params[_PARAM_NUM];
vx_border_t border;
vx_reference * args;
args = &params[_IO_NUM];
if( NULL == self->n )
{
return VSI_FAILURE;
}
/* Set inputs and outputs */
_set_inputs_outputs( params, inputs, outputs );
/* Init parameters. */
_create_params( self, args, _ARG_NUM );
/* Pass parameters to node. */
status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM );
_release_params( args, _ARG_NUM );
border.mode = VX_BORDER_REPLICATE;
border.constant_value.U32 = 0;
status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border));
return status;
}
static vsi_nn_op_compute_t op_compute_list[] =
{
cpu_op_compute,
vx_op_compute,
NULL
};
#endif
static vsi_status op_compute
(
vsi_nn_node_t * self,
@ -236,7 +52,6 @@ static vsi_status op_compute
)
{
vsi_status status = VSI_FAILURE;
#if (USE_OVX_API == TRUE)
vx_nn_stride_slice_params_t param;
vsi_nn_tensor_t *begin_dims_tensor = NULL;
vsi_nn_tensor_t *end_dims_tensor = NULL;
@ -317,36 +132,6 @@ static vsi_status op_compute
{
status = VSI_SUCCESS;
}
#else
vsi_nn_kernel_info_t kernel_info;
memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t));
status = VSI_FAILURE;
kernel_info.resource_num = 1;
kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *));
kernel_info.resource_name[0] = "vsi_nn_kernel_crop";
kernel_info.type = vsi_nn_GetVXKernelTypeForShader();
kernel_info.kernel = vx_kernel_CROP_list;
kernel_info.init_index = 1;
if (vsi_nn_is_do_vx_op_pre_init(kernel_info.type))
{
vx_op_pre_init(self, inputs, outputs, &kernel_info);
}
self->n = vsi_nn_RegisterClientKernelAndNewNode(
self->graph, &kernel_info);
if (kernel_info.resource_name) free(kernel_info.resource_name);
if( NULL == self->n )
{
return VSI_FAILURE;
}
if (NULL != op_compute_list[kernel_info.init_index])
{
status = op_compute_list[kernel_info.init_index](self, inputs, outputs);
}
#endif
OnError:
if (begin_dims_tensor) vsi_nn_ReleaseTensor(&begin_dims_tensor);
if (end_dims_tensor) vsi_nn_ReleaseTensor(&end_dims_tensor);
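With the shader path removed, CROP is expressed only through the OVX strided-slice node built from the begin/end tensors above. A sketch of the assumed mapping from crop offsets and output sizes to those parameters (the exact computation sits in the part of op_compute not shown in this hunk; the helper name is hypothetical):
#include <stdint.h>
/* begin = offset, end = offset + output_size, stride = 1 on every axis. */
static void crop_to_strided_slice(const int32_t *offset, const uint32_t *out_size,
                                  uint32_t dim_num, int32_t *begin,
                                  int32_t *end, int32_t *stride)
{
    uint32_t i;
    for (i = 0; i < dim_num; ++i)
    {
        begin[i]  = offset[i];
        end[i]    = offset[i] + (int32_t)out_size[i];
        stride[i] = 1;
    }
}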

View File

@ -221,6 +221,9 @@ static vsi_bool op_check
IO_TYPE(D_BF16, D_F32)
IO_TYPE(D_I32, D_I32)
IO_TYPE(D_I32, D_I16|Q_DFP)
IO_TYPE(D_I16, D_I16|Q_DFP)
IO_TYPE(D_I8, D_I8|Q_DFP)
IO_TYPE(D_U8, D_U8|Q_ASYM)
END_IO_TYPE_DECL(DATACONVERT)
if (!VALIDATE_OP_IO_TYPES(DATACONVERT, self, inputs, self->input.num, outputs, self->output.num))
{

View File

@ -196,6 +196,7 @@ static vsi_bool op_check
IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP)
IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM)
IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP)

View File

@ -89,7 +89,11 @@ static vsi_bool op_check
IO_TYPE(D_I32, D_F16, D_F16)
IO_TYPE(D_I32, D_F32, D_F32)
IO_TYPE(D_I32, D_I32, D_I32)
IO_TYPE(D_I32, D_U8|Q_ASYM, D_F32)
IO_TYPE(D_I32, D_U8|Q_ASYM, D_U8|Q_ASYM)
IO_TYPE(D_I32, D_U8|Q_ASYM, D_I8|Q_DFP)
IO_TYPE(D_I32, D_U8|Q_ASYM, D_I8)
IO_TYPE(D_F16, D_F16, D_F16)
END_IO_TYPE_DECL(EMBEDDING_LOOKUP)
if (!VALIDATE_OP_IO_TYPES(EMBEDDING_LOOKUP, self, inputs, self->input.num, outputs, self->output.num))

View File

@ -42,215 +42,6 @@
#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM)
#define _PARAM_NUM (_ARG_NUM + _IO_NUM)
#define USE_OVX_API TRUE
#if (USE_OVX_API == FALSE)
extern vx_kernel_description_t * vx_kernel_FCL2_list[];
static void _set_inputs_outputs
(
vx_reference * params,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
uint32_t i;
uint32_t cnt;
/* Set inputs */
cnt = 0;
for( i = 0; i < _INPUT_NUM; i ++, cnt ++ )
{
params[cnt] = (vx_reference)inputs[i]->t;
}
/* Set outputs */
for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ )
{
params[cnt] = (vx_reference)outputs[i]->t;
}
} /* _set_inputs_outputs() */
static vsi_status _create_params
(
vsi_nn_node_t * node,
vx_reference * params,
uint32_t num
)
{
vsi_status status;
vx_context ctx;
vsi_nn_fcl_param * p;
if( 0 == num )
{
return VSI_SUCCESS;
}
memset( params, 0, sizeof( vx_reference * ) * num );
p = &(node->nn_param.fcl);
ctx = vxGetContext( (vx_reference)node->graph->g );
/* Init parameters */
#define _SET_PARAM( i, type, arg ) do{ \
params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \
status = vxGetStatus( params[i] ); \
if( VSI_SUCCESS != status ) { \
goto set_param_error; \
} \
} while(0)
_SET_PARAM( 0, VX_TYPE_INT32, axis );
//_SET_PARAM( 1, VX_TYPE_FLOAT32, bias );
//_SET_PARAM( 2, VX_TYPE_TENSOR, data_bias );
//_SET_PARAM( 3, VX_TYPE_TENSOR, data_weight );
//_SET_PARAM( 4, VX_TYPE_FLOAT32, regularize );
_SET_PARAM( 1, VX_TYPE_INT32, weights );
#undef _SET_PARAM
set_param_error:
return status;
} /* _create_params */
static void _release_params
(
vx_reference * params,
uint32_t num
)
{
uint32_t i;
vx_scalar scalar;
for( i = 0; i < num; i ++ )
{
scalar = (vx_scalar)params[i];
vxReleaseScalar( &scalar );
}
} /* _release_params() */
static vsi_status cpu_op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_status status = VSI_SUCCESS;
vx_reference params[_PARAM_NUM];
vx_reference * args;
args = &params[_IO_NUM];
if( NULL == self->n )
{
return VSI_FAILURE;
}
/* Set inputs and outputs */
_set_inputs_outputs( params, inputs, outputs );
/* Init parameters. */
_create_params( self, args, _ARG_NUM );
/* Pass parameters to node. */
status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM );
_release_params( args, _ARG_NUM );
return status;
}
static vsi_status vx_op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_status status = VSI_SUCCESS;
uint32_t axis;
vsi_nn_fcl_param * p;
uint32_t i = 0;
uint32_t num_fc = 1, num_no_fc = 1;
uint32_t num_of_dims[3] = {0};
uint32_t input_size[VSI_NN_MAX_DIM_NUM] = {0};
uint32_t output_size[VSI_NN_MAX_DIM_NUM] = {0};
uint32_t weights_size[VSI_NN_MAX_DIM_NUM] = {0};
int32_t size[VSI_NN_MAX_DIM_NUM] = {0};
uint32_t ofm = 0;
uint32_t dims = 0;
vx_tensor input = NULL;
vx_tensor output = NULL;
vx_tensor weight = NULL;
vx_tensor bias = NULL;
int32_t index = 0;
vx_border_t border;
if( NULL == self->n )
{
return VSI_FAILURE;
}
p = (vsi_nn_fcl_param *)&(self->nn_param.fcl);
axis = p->axis;
memcpy(input_size, inputs[0]->attr.size, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM);
num_of_dims[0] = inputs[0]->attr.dim_num;
memcpy(output_size, outputs[0]->attr.size, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM);
num_of_dims[1] = outputs[0]->attr.dim_num;
memcpy(weights_size, inputs[1]->attr.size, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM);
num_of_dims[2] = inputs[1]->attr.dim_num;
ofm = weights_size[num_of_dims[2] - 1];
for(i = 0; i <= (uint32_t)axis; ++i)
{
num_fc *= input_size[i];
}
for(i = axis + 1; i < num_of_dims[0]; ++i)
{
num_no_fc *= input_size[i];
}
size[0] = num_fc;
size[1] = num_no_fc;
dims= 2;
input = vxReshapeTensor(inputs[0]->t, size, dims);
size[0] = num_fc;
size[1] = ofm;
dims= 2;
weight = vxReshapeTensor(inputs[1]->t, size, dims);
size[0] = ofm;
size[1] = 1;
dims= 2;
bias = vxReshapeTensor(inputs[2]->t, size, dims);
size[0] = ofm;
size[1] = num_no_fc;
dims= 2;
output = vxReshapeTensor(outputs[0]->t, size, dims);
status |= vxSetParameterByIndex(self->n, index++, (vx_reference)input);
status |= vxSetParameterByIndex(self->n, index++, (vx_reference)weight);
status |= vxSetParameterByIndex(self->n, index++, (vx_reference)bias);
status |= vxSetParameterByIndex(self->n, index++, (vx_reference)output);
border.mode = VX_BORDER_CONSTANT;
border.constant_value.S16 = 0;
status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border));
if (input) vxReleaseTensor(&input);
if (weight) vxReleaseTensor(&weight);
if (bias) vxReleaseTensor(&bias);
if (output) vxReleaseTensor(&output);
return status;
}
static vsi_nn_op_compute_t op_compute_list[] =
{
cpu_op_compute,
vx_op_compute,
NULL
};
#endif
static vsi_status op_compute
(
vsi_nn_node_t * self,
@ -259,7 +50,6 @@ static vsi_status op_compute
)
{
vsi_status status = VSI_FAILURE;
#if (USE_OVX_API == TRUE)
uint32_t axis;
vsi_nn_fcl_param * p;
uint32_t i = 0;
@ -343,30 +133,7 @@ static vsi_status op_compute
if (weight) vxReleaseTensor(&weight);
if (bias) vxReleaseTensor(&bias);
if (output) vxReleaseTensor(&output);
#else
vsi_nn_kernel_info_t kernel_info;
memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t));
status = VSI_FAILURE;
kernel_info.resource_num = 1;
kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *));
kernel_info.resource_name[0] = "vsi_nn_kernel_fullconnect2";
kernel_info.type = VX_KERNEL_TYPE_VX;
kernel_info.kernel = vx_kernel_FCL2_list;
kernel_info.kernel_index = 1;
kernel_info.init_index = 1;
self->n = vsi_nn_RegisterClientKernelAndNewNode(
self->graph, &kernel_info);
if (kernel_info.resource_name) free(kernel_info.resource_name);
if( NULL == self->n )
{
return VSI_FAILURE;
}
if (NULL != op_compute_list[kernel_info.init_index])
{
status = op_compute_list[kernel_info.init_index](self, inputs, outputs);
}
#endif
return status;
} /* op_compute() */
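Both the removed shader path and the surviving OVX path fold the tensors to 2D around the FC axis: every dimension up to and including axis becomes the dot-product length, the remaining dimensions become the batch. A small helper sketch of that folding (illustrative only, not a library API):
#include <stdint.h>
/* Resulting 2D views, with ofm = last weight dimension:
 *   input  -> [num_fc,  num_no_fc]
 *   weight -> [num_fc,  ofm]
 *   bias   -> [ofm,     1]
 *   output -> [ofm,     num_no_fc]                                   */
static void fcl2_fold_shapes(const uint32_t *input_size, uint32_t input_dims,
                             uint32_t axis, uint32_t *num_fc, uint32_t *num_no_fc)
{
    uint32_t i;
    *num_fc = 1;
    *num_no_fc = 1;
    for (i = 0; i <= axis; ++i)             /* folded into the dot-product length */
        *num_fc *= input_size[i];
    for (i = axis + 1; i < input_dims; ++i) /* folded into the batch dimension    */
        *num_no_fc *= input_size[i];
}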

View File

@ -74,6 +74,7 @@ static vsi_status op_compute
vsi_nn_kernel_param_add_int32( param, "block_size", block_size );
vsi_nn_kernel_param_add_int32( param, "block_num", block_num );
vsi_nn_kernel_param_add_int32( param, "axis_num", axis_num );
vsi_nn_kernel_param_add_int32( param, "axis", axis );
vsi_nn_kernel_param_add_int32( param, "indices_num", indices_num );
n = vsi_nn_kernel_selector( self->graph, "gather", inputs, 2, outputs, 1, param );
if( n != NULL )

View File

@ -41,6 +41,50 @@
#define _INPUT_NUM (3)
#define _OUTPUT_NUM (1)
static vsi_status _try_set_high_presision_tensor
(
vsi_nn_tensor_t **inputs
)
{
vsi_status status;
vsi_nn_vxtensor_attr_t attr;
status = VSI_SUCCESS;
attr = VSI_NN_TENSOR_ATTR_HIGH_PRECISION;
if(VSI_NN_TYPE_FLOAT32 == inputs[1]->attr.dtype.vx_type)
{
status = vsi_nn_SetTensorAttr(inputs[1], attr);
if(VSI_SUCCESS != status)
{
return status;
}
}
if(VSI_NN_TYPE_FLOAT32 == inputs[2]->attr.dtype.vx_type)
{
status = vsi_nn_SetTensorAttr(inputs[2], attr);
if(VSI_SUCCESS != status)
{
return status;
}
}
return status;
}
static vsi_bool _is_3d_instance_norm
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs
)
{
if( 3 == inputs[0]->attr.dim_num )
{
return TRUE;
}
return FALSE;
} /* _is_3d_instance_norm() */
static vsi_status op_compute
(
vsi_nn_node_t * self,
@ -55,19 +99,42 @@ static vsi_status op_compute
uint32_t *input_size = inputs[0]->attr.size;
uint32_t dims_num = inputs[0]->attr.dim_num;
int32_t rs_flg = 0;
vsi_nn_tensor_t * tmp_inputs[3] = {NULL, NULL, NULL};
vsi_nn_tensor_t * tmp_outputs[1] = {NULL};
vsi_nn_instancenorm_lcl_data2 *local = self->nn_param.instancenorm.lcl2_data;
param =vsi_nn_kernel_param_create();
if((input_size[1] * input_size[2] < 65536)
&& dims_num > 2)
status = _try_set_high_presision_tensor(inputs);
if(status != VSI_SUCCESS)
{
rs_flg = 1;
VSILOGE("Set tensor attr of high presision fail");
return status;
}
if(_is_3d_instance_norm(self, inputs))
{
tmp_inputs[0] = local->reshaped_input;
tmp_outputs[0] = local->reshaped_output;
tmp_inputs[1] = inputs[1];
tmp_inputs[2] = inputs[2];
}
else
{
tmp_inputs[0] = inputs[0];
tmp_outputs[0] = outputs[0];
tmp_inputs[1] = inputs[1];
tmp_inputs[2] = inputs[2];
if((input_size[1] * input_size[2] < 65536)
&& dims_num > 2)
{
rs_flg = 1;
}
}
param =vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_float32( param, "eps", eps );
vsi_nn_kernel_param_add_int32( param, "reshape_flg", rs_flg );
n = vsi_nn_kernel_selector( self->graph, "instance_norm",
inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param );
tmp_inputs, _INPUT_NUM, tmp_outputs, _OUTPUT_NUM, param );
if( n != NULL )
{
self->n = (vx_node)n;
@ -82,6 +149,59 @@ static vsi_status op_compute
return status;
} /* op_compute() */
static vsi_status op_optimize
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs,
vsi_nn_opt_direction_e direction
)
{
uint32_t dim = 0;
vsi_nn_instancenorm_lcl_data2 *local = NULL;
uint32_t shape[VSI_NN_MAX_DIM_NUM];
char tensor_name[128];
dim = inputs[0]->attr.dim_num;
if(_is_3d_instance_norm(self, inputs) == FALSE)
{
return VSI_SUCCESS;
}
VSILOGD("Optimize 3D %s, uid %u", vsi_nn_OpGetName(self->op), self->uid);
/*
insert a reshape node before and after 3D instance_norm
*/
shape[0] = 1;
shape[1] = inputs[0]->attr.size[0];
shape[2] = inputs[0]->attr.size[1];
shape[3] = inputs[0]->attr.size[2];
dim = 4;
local = self->nn_param.instancenorm.lcl2_data;
if (VSI_NN_OPTIMIZE_FORWARD == direction)
{
/* reshape 3d input (xcn) --> 4d input (whcn) */
local->reshaped_input = vsi_nn_reshape_tensor(self->graph, inputs[0], shape, dim);
}
else
{
/* reshape 3d output(xcn) --> 4d output(whcn) */
local->reshaped_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape, dim);
if(local->reshaped_output && local->reshaped_output->t)
{
memset(tensor_name, 0, sizeof(tensor_name));
snprintf(tensor_name, sizeof(tensor_name), "uid_%u_reshape_out_0", self->uid);
if(vxSetReferenceName((vx_reference)local->reshaped_output->t, tensor_name) == VSI_FAILURE)
{
VSILOGW("Set uid %u batchnorm reshaped output name fail", self->uid);
return VSI_FAILURE;
}
}
}
return VSI_SUCCESS;
} /* op_optimize() */
static vsi_bool op_check
(
vsi_nn_node_t * self,
@ -133,6 +253,8 @@ static vsi_status op_init
self->nn_param.instancenorm.lcl2_data->reshapeFlg = 0;
self->nn_param.instancenorm.lcl2_data->execute_on_sw = 0;
self->nn_param.instancenorm.lcl2_data->hash_idx = 0;
self->nn_param.instancenorm.lcl2_data->reshaped_input = NULL;
self->nn_param.instancenorm.lcl2_data->reshaped_output = NULL;
return status;
} /* op_init() */
@ -143,6 +265,7 @@ static vsi_status op_deinit
)
{
uint32_t i;
vsi_nn_instancenormalize_param *p = &(self->nn_param.instancenorm);
for (i = 0; i < _VSI_NN_INSTANCENORM_LOCAL_TENSOR_NUM; i++)
{
if (self->nn_param.instancenorm.local.local_tensor[i] != NULL)
@ -151,6 +274,16 @@ static vsi_status op_deinit
self->nn_param.instancenorm.local.local_tensor[i] = NULL;
}
}
if(p->lcl2_data->reshaped_input)
{
vsi_nn_ReleaseTensor(&(p->lcl2_data->reshaped_input));
p->lcl2_data->reshaped_input = NULL;
}
if(p->lcl2_data->reshaped_output)
{
vsi_nn_ReleaseTensor(&(p->lcl2_data->reshaped_output));
p->lcl2_data->reshaped_output = NULL;
}
if(self->nn_param.instancenorm.lcl2_data)
{
free(self->nn_param.instancenorm.lcl2_data);
@ -173,7 +306,7 @@ DEF_OP_REG
/* deinit */ op_deinit,
/* check */ op_check,
/* setup */ vsi_nn_op_common_setup,
/* optimize */ NULL,
/* optimize */ op_optimize,
/* input_num */ _INPUT_NUM,
/* output_num */ _OUTPUT_NUM
);
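For reference, the math the instance_norm kernels implement per channel and per batch item, using the eps passed through the kernel parameters; gamma and beta stand for the scale and bias inputs. A scalar sketch over one H*W plane, not the GPU implementation:
#include <math.h>
#include <stddef.h>
/* Normalize one plane of n elements with its own mean and variance. */
static void instance_norm_plane_ref(const float *x, float *y, size_t n,
                                    float gamma, float beta, float eps)
{
    float mean = 0.0f, var = 0.0f;
    size_t i;
    for (i = 0; i < n; ++i) mean += x[i];
    mean /= (float)n;
    for (i = 0; i < n; ++i) var += (x[i] - mean) * (x[i] - mean);
    var /= (float)n;
    for (i = 0; i < n; ++i)
        y[i] = gamma * (x[i] - mean) / sqrtf(var + eps) + beta;
}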

View File

@ -115,6 +115,45 @@ final:
}
static vsi_bool _check_value_is_equal_to_one
(
vsi_nn_graph_t* graph,
vsi_nn_tensor_t* tensor
)
{
vsi_bool ret = TRUE;
float* tensor_data = NULL;
uint32_t elements = 0;
uint32_t i = 0;
elements = vsi_nn_GetElementNum( tensor );
tensor_data = vsi_nn_ConvertTensorToFloat32Data( graph, tensor );
if ( NULL == tensor_data )
{
VSILOGE( "Convert data fail." );
return FALSE;
}
for (i = 0; i < elements; i++)
{
if ( vsi_abs(tensor_data[i] - 1.0f) > 1e-5 )
{
ret = FALSE;
break;
}
}
if ( !tensor->attr.is_created_from_handle )
{
if ( tensor_data )
{
free(tensor_data);
}
}
return ret;
}
static vsi_status op_compute
(
vsi_nn_node_t * self,
@ -141,6 +180,11 @@ static vsi_status op_compute
p = &(self->nn_param.l2normalizescale);
axis = p->axis;
if ( inputs[1]->attr.is_const == TRUE && _check_value_is_equal_to_one(self->graph, inputs[1]) )
{
return vsi_nn_internal_compute_node( self );
}
param =vsi_nn_kernel_param_create();
ret = vsi_nn_kernel_optimize_reduce_shape(
@ -240,6 +284,9 @@ static vsi_status op_deinit
self->nn_param.l2normalizescale.local.local_tensor[i] = NULL;
}
}
vsi_nn_internal_deinit_node_wksp( self );
vsi_nn_op_common_deinit(self);
return VSI_SUCCESS;
@ -253,11 +300,15 @@ static vsi_bool op_setup
)
{
vsi_bool ret = TRUE;
vsi_nn_internal_node_t* curr = NULL;
if( NULL == self )
{
return FALSE;
}
vsi_nn_internal_init_node_wksp( self );
if (self->nn_param.l2normalizescale.axis < 0)
{
self->nn_param.l2normalizescale.axis += (int32_t)inputs[0]->attr.dim_num;
@ -269,6 +320,15 @@ static vsi_bool op_setup
return FALSE;
}
if ( inputs[1]->attr.is_const == TRUE && _check_value_is_equal_to_one( self->graph, inputs[1] ) )
{
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_L2_NORMALIZE, 0, 0);
curr->node->nn_param.l2_normalize.axis = self->nn_param.l2normalizescale.axis;
curr->inputs[0] = inputs[0];
curr->outputs[0] = outputs[0];
vsi_nn_internal_setup_node( self, curr );
}
ret = vsi_nn_op_common_setup(self, inputs, outputs);
return ret;
@ -280,7 +340,7 @@ static vsi_status op_init
)
{
vsi_status status = VSI_SUCCESS;
uint32_t i;
uint32_t i = 0;
if (vsi_nn_compareVersion(self->graph, 1, 1, 13) == -1)
{
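The bypass added above relies on l2normalizescale computing y = scale * x / ||x||_2 along the chosen axis, so a constant scale of 1.0 (to within the 1e-5 tolerance checked earlier) degenerates to plain L2_NORMALIZE. A one-dimensional scalar sketch; the small epsilon guarding a zero vector is an assumption, not taken from this diff:
#include <math.h>
#include <stddef.h>
/* y[i] = scale[i] * x[i] / sqrt(sum_j x[j]^2) over one row of n elements. */
static void l2_normalize_scale_ref(const float *x, const float *scale,
                                   float *y, size_t n)
{
    float sum_sq = 0.0f, inv_norm;
    size_t i;
    for (i = 0; i < n; ++i) sum_sq += x[i] * x[i];
    inv_norm = 1.0f / sqrtf(sum_sq + 1e-12f); /* guard against an all-zero row */
    for (i = 0; i < n; ++i) y[i] = scale[i] * x[i] * inv_norm;
}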

View File

@ -35,312 +35,11 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
#include "client/vsi_nn_vxkernel.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_constraint_check.h"
#define _ARG_NUM (1)
#define _INPUT_NUM (3)
#define _OUTPUT_NUM (1)
#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM)
#define _PARAM_NUM (_ARG_NUM + _IO_NUM)
extern vx_kernel_description_t * vx_kernel_LAYERNORM_list[];
static void check_tensor_shape
(
vsi_nn_node_t * self,
vsi_nn_tensor_t * input,
vx_reference * params,
uint32_t index,
vx_bool rsFlg
)
{
vsi_nn_tensor_attr_t attr;
if (index == 0 )
{
if(input->attr.dim_num == 1)
{
memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t));
attr.size[1] = 1;
attr.dim_num = 2;
self->nn_param.layernorm.local.local_tensor[index] =
vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num);
params[index] = (vx_reference)self->nn_param.layernorm.local.local_tensor[index];
}
else if ((input->attr.dim_num == 3 && input->attr.size[2] == 1)
||(input->attr.dim_num == 4 && input->attr.size[2] == 1 && input->attr.size[3] == 1))
{
memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t));
attr.dim_num = 2;
self->nn_param.layernorm.local.local_tensor[index] =
vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num);
params[index] = (vx_reference)self->nn_param.layernorm.local.local_tensor[index];
}
else
params[index] = (vx_reference)input->t;
}
else if(index == 1 )
{
if(input->attr.dim_num == 1)
{
memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t));
attr.size[1] = 1;
attr.dim_num = 2;
self->nn_param.layernorm.local.local_tensor[index] =
vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num);
params[index] = (vx_reference)self->nn_param.layernorm.local.local_tensor[index];
}
else
params[index] = (vx_reference)input->t;
}
else if(index == 2)
{
if(input->attr.dim_num == 1)
{
memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t));
attr.size[1] = 1;
attr.dim_num = 2;
self->nn_param.layernorm.local.local_tensor[index] =
vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num);
params[index] = (vx_reference)self->nn_param.layernorm.local.local_tensor[index];
}
else
params[index] = (vx_reference)input->t;
}
else if(index == 3)
{
if(input->attr.dim_num == 1)
{
memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t));
attr.size[1] = 1;
attr.dim_num = 2;
self->nn_param.layernorm.local.local_tensor[index] =
vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num);
params[index] = (vx_reference)self->nn_param.layernorm.local.local_tensor[index];
}
else if ((input->attr.dim_num == 3 && input->attr.size[2] == 1)
||(input->attr.dim_num == 4 && input->attr.size[2] == 1 && input->attr.size[3] == 1))
{
memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t));
attr.dim_num = 2;
self->nn_param.layernorm.local.local_tensor[index] =
vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num);
params[index] = (vx_reference)self->nn_param.layernorm.local.local_tensor[index];
}
else
params[index] = (vx_reference)input->t;
}
else
{
VSILOGE("No more local tensor!(LAYERNORM) at [%s : %d]\n", __FILE__, __LINE__);
}
}
static void _set_inputs_outputs
(
vx_reference * params,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
uint32_t i;
uint32_t cnt;
/* Set inputs */
cnt = 0;
for( i = 0; i < _INPUT_NUM; i ++, cnt ++ )
{
params[cnt] = (vx_reference)inputs[i]->t;
}
/* Set outputs */
for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ )
{
params[cnt] = (vx_reference)outputs[i]->t;
}
} /* _set_inputs_outputs() */
static vsi_status _create_params
(
vsi_nn_node_t * node,
vx_reference * params,
uint32_t num
)
{
vsi_status status;
vx_context ctx;
vsi_nn_layernormalize_param * p;
if( 0 == num )
{
return VSI_SUCCESS;
}
memset( params, 0, sizeof( vx_reference * ) * num );
p = &(node->nn_param.layernorm);
ctx = vxGetContext( (vx_reference)node->graph->g );
/* Init parameters */
#define _SET_PARAM( i, type, arg ) do{ \
params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \
status = vxGetStatus( params[i] ); \
if( VSI_SUCCESS != status ) { \
goto set_param_error; \
} \
} while(0)
_SET_PARAM( 0, VX_TYPE_FLOAT32, eps );
#undef _SET_PARAM
set_param_error:
return status;
} /* _create_params */
static void _release_params
(
vx_reference * params,
uint32_t num
)
{
uint32_t i;
vx_scalar scalar;
for( i = 0; i < num; i ++ )
{
scalar = (vx_scalar)params[i];
vxReleaseScalar( &scalar );
}
} /* _release_params() */
static vsi_status cpu_op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_status status = VSI_SUCCESS;
vx_reference params[_PARAM_NUM];
vx_reference * args;
args = &params[_IO_NUM];
if( NULL == self->n )
{
return VSI_FAILURE;
}
/* Set inputs and outputs */
_set_inputs_outputs( params, inputs, outputs );
/* Init parameters. */
_create_params( self, args, _ARG_NUM );
/* Pass parameters to node. */
status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM );
_release_params( args, _ARG_NUM );
return status;
}
static vsi_status vx_op_pre_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs,
vsi_nn_kernel_info_t * kernel_info
)
{
vsi_nn_type_e inputDataFormat = inputs[0]->attr.dtype.vx_type;
vsi_nn_type_e outputDataFormat = outputs[0]->attr.dtype.vx_type;
vsi_nn_type_e scaleDataFormat = inputs[2]->attr.dtype.vx_type;
if (inputDataFormat == VSI_NN_TYPE_FLOAT16 && outputDataFormat == VSI_NN_TYPE_FLOAT16
&& scaleDataFormat == VSI_NN_TYPE_FLOAT16)
{
kernel_info->kernel_index = 1;
}
else if (inputDataFormat == VSI_NN_TYPE_UINT8 && outputDataFormat == VSI_NN_TYPE_UINT8
&& scaleDataFormat == VSI_NN_TYPE_FLOAT16)
{
kernel_info->kernel_index = 2;
}
else if (inputDataFormat == VSI_NN_TYPE_FLOAT16 && outputDataFormat == VSI_NN_TYPE_UINT8
&& scaleDataFormat == VSI_NN_TYPE_FLOAT16)
{
kernel_info->kernel_index = 3;
}
else if (inputDataFormat == VSI_NN_TYPE_UINT8 && outputDataFormat == VSI_NN_TYPE_FLOAT16
&& scaleDataFormat == VSI_NN_TYPE_FLOAT16)
{
kernel_info->resource_name[0] = "vsi_nn_kernel_layernormalize_U8";
kernel_info->kernel_index = 4;
}
else
{
VSILOGE("Not support input or output data format!(LAYERNORM) at [%s : %d]\n", __FILE__, __LINE__);
return VSI_FAILURE;
}
return VSI_SUCCESS;
}
static vsi_status vx_op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_status status = VSI_SUCCESS;
vx_reference params[_PARAM_NUM];
vx_border_t border;
vx_reference * args;
vx_bool rsFlg = FALSE;
int32_t in_zp;
vsi_nn_type_e inputDataFormat = inputs[0]->attr.dtype.vx_type;
vsi_nn_tensor_attr_t attr;
args = &params[_IO_NUM];
if( NULL == self->n )
{
return VSI_FAILURE;
}
/* Set inputs and outputs */
//_set_inputs_outputs( params, inputs, outputs );
check_tensor_shape(self, inputs[0], params, 0, rsFlg);
check_tensor_shape(self, inputs[1], params, 1, rsFlg);
check_tensor_shape(self, inputs[2], params, 2, rsFlg);
check_tensor_shape(self, outputs[0], params, 3, rsFlg);
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
status = vsi_nn_vxGetTensorAttr(inputs[0]->t, &attr);
in_zp = attr.dtype.zero_point;
/* Init parameters. */
_create_params( self, args, _ARG_NUM );
/* Pass parameters to node. */
status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM );
_release_params( args, _ARG_NUM );
border.mode = VX_BORDER_CONSTANT;
border.constant_value.U32 = 0;
border.constant_value.S16 = 0;
border.constant_value.U8 = 0;
if(inputDataFormat == VSI_NN_TYPE_UINT8)
{
border.constant_value.U32 = (vx_uint32)in_zp;
border.constant_value.S16 = (vx_int16)in_zp;
border.constant_value.U8 = (vx_uint8)in_zp;
}
status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border));
return status;
}
static vsi_nn_op_compute_t op_compute_list[] =
{
cpu_op_compute,
vx_op_compute,
NULL
};
static vsi_status op_compute
(
@ -349,35 +48,44 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
vsi_status status;
vsi_nn_kernel_info_t kernel_info;
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_param_t * param = NULL;
vsi_nn_kernel_node_t n = NULL;
float eps = self->nn_param.layernorm.eps;
uint32_t *input_size = inputs[0]->attr.size;
uint32_t dims_num = inputs[0]->attr.dim_num;
int32_t rs_flg = 0;
int32_t wh_flg = 0;
memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t));
status = VSI_FAILURE;
kernel_info.resource_num = 1;
kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *));
kernel_info.resource_name[0] = "vsi_nn_kernel_layernormalize";
kernel_info.type = vsi_nn_GetVXKernelTypeForShader();
kernel_info.kernel = vx_kernel_LAYERNORM_list;
kernel_info.init_index = 1;
param =vsi_nn_kernel_param_create();
if (vsi_nn_is_do_vx_op_pre_init(kernel_info.type))
if (input_size[0] >= GPU_TENSOR_MAX_WIDTH)
{
vx_op_pre_compute(self, inputs, outputs, &kernel_info);
wh_flg = 1;
}
self->n = vsi_nn_RegisterClientKernelAndNewNode(
self->graph, &kernel_info);
if (kernel_info.resource_name) free(kernel_info.resource_name);
if( NULL == self->n )
if ((input_size[1] * input_size[2] < GPU_TENSOR_MAX_WIDTH)
&& dims_num > 2)
{
return VSI_FAILURE;
rs_flg = 1;
}
if (NULL != op_compute_list[kernel_info.init_index])
vsi_nn_kernel_param_add_float32( param, "eps", eps );
vsi_nn_kernel_param_add_int32( param, "reshape_flg", rs_flg );
vsi_nn_kernel_param_add_int32( param, "wh_flg", wh_flg );
n = vsi_nn_kernel_selector( self->graph, "layer_norm",
inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param );
if ( n != NULL )
{
status = op_compute_list[kernel_info.init_index](self, inputs, outputs);
self->n = (vx_node)n;
status = VSI_SUCCESS;
}
if (param != NULL)
{
vsi_nn_kernel_param_release( &param );
}
return status;
} /* op_compute() */
@ -389,10 +97,12 @@ static vsi_bool op_check
)
{
BEGIN_IO_TYPE_DECL(LAYER_NORM, 3, 1)
IO_TYPE(D_F32, D_F32, D_F32, D_F32)
IO_TYPE(D_F16, D_F32, D_F16, D_F16)
IO_TYPE(D_F16, D_F32, D_F16, D_U8|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F16)
IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_U8|Q_ASYM)
IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_I16|Q_DFP)
END_IO_TYPE_DECL(LAYER_NORM)
if (!VALIDATE_OP_IO_TYPES(LAYER_NORM, self, inputs, self->input.num, outputs, self->output.num))
{
@ -438,8 +148,8 @@ DEF_OP_REG
/* check */ op_check,
/* setup */ vsi_nn_op_common_setup,
/* optimize */ NULL,
/* input_num */ 3,
/* output_num */ 1
/* input_num */ _INPUT_NUM,
/* output_num */ _OUTPUT_NUM
);
#ifdef __cplusplus
}

Some files were not shown because too many files have changed in this diff