From c14141623833f9af115cb2fdfb3f61cf45703182 Mon Sep 17 00:00:00 2001 From: Kainan Cha Date: Mon, 29 Mar 2021 16:21:46 +0800 Subject: [PATCH] Update internal to REL/v1.1.30.2 SHA: 2e64046f Signed-off-by: Kainan Cha --- src/tim/vx/internal/BUILD | 9 - src/tim/vx/internal/include/interface/ops.def | 1 + .../include/internal/internal_ops.def | 1 + .../kernel/vsi_nn_kernel_gpu_shape_optimize.h | 20 + .../internal/include/libnnext/vx_lib_nnext.h | 4 - .../include/ops/vsi_nn_op_instancenormalize.h | 4 + .../ops/vsi_nn_op_space2depth_internal.h | 44 + .../include/ops/vsi_nn_op_upsamplescale.h | 39 + src/tim/vx/internal/include/vsi_nn_graph.h | 5 + .../vx/internal/include/vsi_nn_node_type.h | 4 + .../include/vsi_nn_pre_post_process.h | 6 + .../src/kernel/cl/instance_normalization_cl.c | 78 +- .../src/kernel/cl/layer_normalization_cl.c | 395 + .../vx/internal/src/kernel/cl/matrixmul_cl.c | 124 +- .../vx/internal/src/kernel/cl/roi_align_cl.c | 329 + .../src/kernel/cl/space2depth_internal_cl.c | 298 + .../kernel/cpu/instance_normalization_cpu.c | 2 +- .../src/kernel/cpu/layer_normalization_cpu.c | 255 + .../internal/src/kernel/cpu/roi_align_cpu.c | 378 + .../src/kernel/cpu/space2depth_internal_cpu.c | 230 + .../src/kernel/cpu/upsamplescale_cpu.c | 264 + .../src/kernel/evis/a_times_b_plus_c_evis.c | 56 +- .../vx/internal/src/kernel/evis/gather_evis.c | 299 +- .../kernel/evis/instance_normalization_evis.c | 118 +- .../kernel/evis/layer_normalization_evis.c | 1389 ++++ .../src/kernel/evis/pre_process_bgra_evis.c | 64 +- .../src/kernel/evis/pre_process_nv12_evis.c | 101 +- .../src/kernel/evis/pre_process_rgb_evis.c | 233 +- .../src/kernel/evis/pre_process_yuv420_evis.c | 245 +- .../src/kernel/evis/pre_process_yuv444_evis.c | 250 +- .../src/kernel/evis/resize_bilinear_evis.c | 126 +- .../kernel/evis/space2depth_internal_evis.c | 366 + .../src/kernel/evis/upsamplescale_evis.c | 422 ++ .../kernel/vsi_nn_kernel_gpu_shape_optimize.c | 154 +- src/tim/vx/internal/src/kernel/vx/clip_vx.c | 6 +- .../vx/internal/src/kernel/vx/convolutional.c | 15 +- .../internal/src/kernel/vx/eltwise_unary_vx.c | 6 +- .../vx/internal/src/kernel/vx/relu_keras_vx.c | 6 +- .../libnnext/ops/cl/layer_normalization.cl | 143 + .../internal/src/libnnext/ops/cl/matrixmul.cl | 190 +- .../src/libnnext/ops/cl/matrixmul_transA.cl | 40 +- .../internal/src/libnnext/ops/cl/roi_align.cl | 108 + .../libnnext/ops/cl/space2depth_internal.cl | 90 + .../libnnext/ops/kernel/vsi_nn_kernel_crop.c | 253 - .../ops/kernel/vsi_nn_kernel_fullconnect2.c | 323 - .../ops/kernel/vsi_nn_kernel_layernormalize.c | 688 -- .../ops/kernel/vsi_nn_kernel_reduce.c | 190 - .../ops/kernel/vsi_nn_kernel_resize.c | 283 - .../ops/kernel/vsi_nn_kernel_roi_align.c | 317 - .../libnnext/ops/kernel/vsi_nn_kernel_scale.c | 410 - .../ops/kernel/vsi_nn_kernel_shufflechannel.c | 345 - .../ops/kernel/vsi_nn_kernel_space2depth.c | 293 - .../src/libnnext/ops/vx/a_times_b_plus_c.vx | 78 + .../vx/internal/src/libnnext/ops/vx/gather.vx | 121 +- .../src/libnnext/ops/vx/gather_mix.vx | 106 +- .../libnnext/ops/vx/layer_normalization.vx | 279 + ...normalize.vx => layer_normalization_2d.vx} | 62 +- .../ops/vx/layer_normalization_i16.vx | 167 + .../ops/vx/layer_normalization_u8_f16.vx | 252 + .../ops/vx/layer_normalization_wh_f16.vx | 426 ++ .../ops/vx/layer_normalization_wh_i16.vx | 266 + .../ops/vx/layer_normalization_wh_u8.vx | 419 ++ .../libnnext/ops/vx/pre_process_bgra_trans.vx | 136 - .../ops/vx/pre_process_nv12_trans_u8.vx | 89 - .../ops/vx/pre_process_rgb_copy_trans.vx | 94 - 
.../libnnext/ops/vx/pre_process_rgb_trans.vx | 172 - .../ops/vx/pre_process_yuv420_copy_u8.vx | 147 - .../ops/vx/pre_process_yuv420_trans_u8.vx | 235 - .../ops/vx/pre_process_yuv444_copy_u8.vx | 147 - .../ops/vx/pre_process_yuv444_trans_u8.vx | 196 - .../libnnext/ops/vx/resize_bilinear_BF16.vx | 144 +- .../libnnext/ops/vx/resize_bilinear_F16.vx | 315 +- .../libnnext/ops/vx/resize_bilinear_I16.vx | 212 +- .../src/libnnext/ops/vx/resize_bilinear_I8.vx | 189 +- .../src/libnnext/ops/vx/resize_bilinear_U8.vx | 276 +- .../libnnext/ops/vx/resize_bilinear_U8_opt.vx | 16 +- .../src/libnnext/ops/vx/resize_nearest.vx | 202 +- .../libnnext/ops/vx/space2depth_internal.vx | 135 + .../src/libnnext/ops/vx/upsamplescale.vx | 58 + .../src/libnnext/ops/vx/upsamplescale_k2.vx | 83 + .../src/libnnext/ops/vx/vsi_nn_kernel_crop.vx | 111 - .../ops/vx/vsi_nn_kernel_fullconnect2.vx | 63 - .../ops/vx/vsi_nn_kernel_layernormalize_U8.vx | 129 - .../libnnext/ops/vx/vsi_nn_kernel_resize.vx | 38 - .../libnnext/ops/vx/vsi_nn_kernel_scale.vx | 49 - .../ops/vx/vsi_nn_kernel_shufflechannel.vx | 67 - .../vx/vsi_nn_kernel_shufflechannel_axis1.vx | 65 - .../ops/vx/vsi_nn_kernel_space2depth.vx | 41 - .../src/libnnext/vsi_nn_libnnext_resource.c | 6652 ++++++++++------- src/tim/vx/internal/src/makefile.linux | 7 +- .../vx/internal/src/ops/vsi_nn_op_argmaxmin.c | 5 +- src/tim/vx/internal/src/ops/vsi_nn_op_crop.c | 215 - .../internal/src/ops/vsi_nn_op_dataconvert.c | 3 + .../src/ops/vsi_nn_op_deconvolution.c | 1 + .../src/ops/vsi_nn_op_embedding_lookup.c | 4 + .../internal/src/ops/vsi_nn_op_fullconnect2.c | 233 - .../vx/internal/src/ops/vsi_nn_op_gather.c | 1 + .../src/ops/vsi_nn_op_instancenormalize.c | 147 +- .../src/ops/vsi_nn_op_l2normalizescale.c | 62 +- .../src/ops/vsi_nn_op_layernormalize.c | 356 +- .../vx/internal/src/ops/vsi_nn_op_matrixmul.c | 20 +- .../internal/src/ops/vsi_nn_op_pre_process.c | 622 +- .../src/ops/vsi_nn_op_pre_process_bgra.c | 25 +- .../src/ops/vsi_nn_op_pre_process_nv12.c | 25 +- .../src/ops/vsi_nn_op_pre_process_rgb.c | 12 +- .../src/ops/vsi_nn_op_pre_process_yuv420.c | 25 +- .../src/ops/vsi_nn_op_pre_process_yuv444.c | 25 +- .../vx/internal/src/ops/vsi_nn_op_reduce.c | 146 - .../vx/internal/src/ops/vsi_nn_op_resize.c | 245 +- .../vx/internal/src/ops/vsi_nn_op_roi_align.c | 242 +- src/tim/vx/internal/src/ops/vsi_nn_op_scale.c | 280 +- .../src/ops/vsi_nn_op_shufflechannel.c | 289 - .../internal/src/ops/vsi_nn_op_space2depth.c | 330 +- .../src/ops/vsi_nn_op_space2depth_internal.c | 159 + .../src/ops/vsi_nn_op_tensorstackconcat.c | 8 +- .../vx/internal/src/ops/vsi_nn_op_unstack.c | 2 +- .../src/ops/vsi_nn_op_upsamplescale.c | 253 + .../src/utils/vsi_nn_code_generator.c | 1 + src/tim/vx/internal/src/vsi_nn_graph.c | 225 +- .../vx/internal/src/vsi_nn_pre_post_process.c | 25 + 120 files changed, 14252 insertions(+), 11997 deletions(-) create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_space2depth_internal.h create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_upsamplescale.h create mode 100644 src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c create mode 100644 src/tim/vx/internal/src/kernel/cl/roi_align_cl.c create mode 100644 src/tim/vx/internal/src/kernel/cl/space2depth_internal_cl.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/layer_normalization_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/space2depth_internal_cpu.c create mode 100644 
src/tim/vx/internal/src/kernel/cpu/upsamplescale_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c create mode 100644 src/tim/vx/internal/src/kernel/evis/space2depth_internal_evis.c create mode 100644 src/tim/vx/internal/src/kernel/evis/upsamplescale_evis.c create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/layer_normalization.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/roi_align.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/space2depth_internal.cl delete mode 100644 src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_crop.c delete mode 100644 src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_fullconnect2.c delete mode 100644 src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_layernormalize.c delete mode 100644 src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_reduce.c delete mode 100644 src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_resize.c delete mode 100644 src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_roi_align.c delete mode 100644 src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_scale.c delete mode 100644 src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_shufflechannel.c delete mode 100644 src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_space2depth.c create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization.vx rename src/tim/vx/internal/src/libnnext/ops/vx/{vsi_nn_kernel_layernormalize.vx => layer_normalization_2d.vx} (87%) create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_i16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_u8_f16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_f16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_i16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_u8.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_bgra_trans.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_trans_u8.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy_trans.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_trans.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_trans_u8.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_trans_u8.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/space2depth_internal.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/upsamplescale.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/upsamplescale_k2.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_crop.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_fullconnect2.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_layernormalize_U8.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_resize.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_scale.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_shufflechannel.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_shufflechannel_axis1.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_space2depth.vx create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_space2depth_internal.c create mode 100644 
src/tim/vx/internal/src/ops/vsi_nn_op_upsamplescale.c diff --git a/src/tim/vx/internal/BUILD b/src/tim/vx/internal/BUILD index ae21b3d..1803e76 100644 --- a/src/tim/vx/internal/BUILD +++ b/src/tim/vx/internal/BUILD @@ -194,22 +194,13 @@ cc_library( "src/kernel/vsi_nn_kernel_param.c", "src/kernel/vsi_nn_gpu.c", "src/kernel/vsi_nn_kernel_gpu_shape_optimize.c", - "src/libnnext/ops/kernel/vsi_nn_kernel_crop.c", - "src/libnnext/ops/kernel/vsi_nn_kernel_fullconnect2.c", - "src/libnnext/ops/kernel/vsi_nn_kernel_shufflechannel.c", - "src/libnnext/ops/kernel/vsi_nn_kernel_resize.c", - "src/libnnext/ops/kernel/vsi_nn_kernel_scale.c", - "src/libnnext/ops/kernel/vsi_nn_kernel_space2depth.c", "src/libnnext/ops/kernel/vsi_nn_kernel_imageprocess.c", - "src/libnnext/ops/kernel/vsi_nn_kernel_layernormalize.c", - "src/libnnext/ops/kernel/vsi_nn_kernel_reduce.c", "src/libnnext/ops/kernel/vsi_nn_kernel_signalframe.c", "src/libnnext/ops/kernel/vsi_nn_kernel_tensorstackconcat.c", "src/libnnext/ops/kernel/vsi_nn_kernel_sync_host.c", "src/libnnext/ops/kernel/vsi_nn_kernel_spatial_transformer.c", "src/libnnext/ops/kernel/vsi_nn_kernel_extra_ending.c", "src/libnnext/ops/kernel/vsi_nn_kernel_topk.c", - "src/libnnext/ops/kernel/vsi_nn_kernel_roi_align.c", "src/libnnext/ops/kernel/vsi_nn_kernel_heatmap_max_keypoint.c", "src/libnnext/ops/kernel/vsi_nn_kernel_axis_aligned_bbox_transform.c", "src/libnnext/ops/kernel/vsi_nn_kernel_box_with_nms_limit.c", diff --git a/src/tim/vx/internal/include/interface/ops.def b/src/tim/vx/internal/include/interface/ops.def index 88c74c2..523f299 100644 --- a/src/tim/vx/internal/include/interface/ops.def +++ b/src/tim/vx/internal/include/interface/ops.def @@ -146,3 +146,4 @@ DEF_OP(SCATTER_ND) DEF_OP(DECONVOLUTION1D) DEF_OP(INTERP) DEF_OP(RESIZE_1D) +DEF_OP(UPSAMPLESCALE) diff --git a/src/tim/vx/internal/include/internal/internal_ops.def b/src/tim/vx/internal/include/internal/internal_ops.def index e8f677b..ab04552 100644 --- a/src/tim/vx/internal/include/internal/internal_ops.def +++ b/src/tim/vx/internal/include/internal/internal_ops.def @@ -16,3 +16,4 @@ DEF_OP(GRUCELL_ACTIVATION_INTERNAL) DEF_OP(GRUCELL_ACTIVATION_INTERNAL_SMA) DEF_OP(RESIZE_1D_BILINEAR_INTERNAL) DEF_OP(RESIZE_1D_NEAREST_INTERNAL) +DEF_OP(SPACE2DEPTH_INTERNAL) diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h index bf2b95d..0b65afc 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h @@ -38,6 +38,14 @@ vsi_bool vsi_nn_kernel_optimize_reduce_shape int32_t* out_axis, uint32_t* out_axis_size ); +vsi_bool vsi_nn_kernel_optimize_tensor_shape + ( + const int32_t* shape_x, const size_t rank_x, + const int32_t *axis, const size_t axis_size, + int32_t* out_shape_x, uint32_t* out_rank_x, + int32_t* out_axis, uint32_t* out_axis_size + ); + vsi_bool vsi_nn_kernel_optimize_element_shape ( const int32_t* shape_x, const size_t rank_x, @@ -59,4 +67,16 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape int32_t* out_shape_output, uint32_t* out_rank_output ); +vsi_bool vsi_nn_kernel_optimize_1d_tensor_shape + ( + const int32_t* shape, const uint32_t rank, + int32_t* out_shape, uint32_t* out_rank + ); + +vsi_bool vsi_nn_kernel_optimize_nchw2xhw_shape + ( + const int32_t* shape, const uint32_t rank, + int32_t* out_shape, uint32_t* out_rank + ); + #endif diff --git a/src/tim/vx/internal/include/libnnext/vx_lib_nnext.h 
b/src/tim/vx/internal/include/libnnext/vx_lib_nnext.h index 4941769..2245dff 100644 --- a/src/tim/vx/internal/include/libnnext/vx_lib_nnext.h +++ b/src/tim/vx/internal/include/libnnext/vx_lib_nnext.h @@ -372,10 +372,6 @@ enum vx_kernel_libnnext_offset_e #define VX_KERNEL_NAME_GRAYSCALETOTENSOR_INT16_COPY VIVANTE_NAMESPACE ".GrayScaletoTensor_Int16_copy" #define VX_KERNEL_NAME_GRAYSCALETOTENSOR_UINT8 VIVANTE_NAMESPACE ".GrayScaletoTensor_UInt8" #define VX_KERNEL_NAME_GRAYSCALETOTENSOR_UINT8_COPY VIVANTE_NAMESPACE ".GrayScaletoTensor_UInt8_copy" -#define VX_KERNEL_NAME_LAYERNORM VIVANTE_NAMESPACE ".vxcLayerNorm" -#define VX_KERNEL_NAME_LAYERNORM_UINT8 VIVANTE_NAMESPACE ".vxcLayerNorm_u8" -#define VX_KERNEL_NAME_LAYERNORM_FP16TOU8 VIVANTE_NAMESPACE ".vxcLayerNormFP16toU8" -#define VX_KERNEL_NAME_LAYERNORM_U8TOFP16 VIVANTE_NAMESPACE ".vxcLayerNormU8toFp16" #define VX_KERNEL_NAME_TENSORSTACKCONCAT VIVANTE_NAMESPACE ".vxcTensorStackConcat" #define VX_KERNEL_NAME_TENSORSTACKCONCAT8BITS VIVANTE_NAMESPACE ".vxcTensorStackConcat8Bits" #define VX_KERNEL_NAME_SIGNALFRAME_WIDTH VIVANTE_NAMESPACE ".vxcSignalFrame_width" diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_instancenormalize.h b/src/tim/vx/internal/include/ops/vsi_nn_op_instancenormalize.h index 5ec359b..e70dc41 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_instancenormalize.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_instancenormalize.h @@ -70,6 +70,10 @@ typedef struct _vsi_nn_instancenorm_lcl_data2 uint32_t reshapeFlg; uint32_t hash_idx; vsi_bool execute_on_sw; + + /* handle 3D instance norm */ + vsi_nn_tensor_t *reshaped_input; + vsi_nn_tensor_t *reshaped_output; } vsi_nn_instancenorm_lcl_data2; typedef struct _vsi_nn_instancenorm_lcl_data diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_space2depth_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_space2depth_internal.h new file mode 100644 index 0000000..e5630ca --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_space2depth_internal.h @@ -0,0 +1,44 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_SPACE2DEPTH_INTERNAL_H +#define _VSI_NN_OP_SPACE2DEPTH_INTERNAL_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_space2depth_internal_param +{ + int32_t block_size_x; + int32_t block_size_y; +} vsi_nn_space2depth_internal_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_upsamplescale.h b/src/tim/vx/internal/include/ops/vsi_nn_op_upsamplescale.h new file mode 100644 index 0000000..f790da2 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_upsamplescale.h @@ -0,0 +1,39 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_UPSAMPLESCALE_H +#define _VSI_NN_OP_UPSAMPLESCALE_H + +#include "vsi_nn_types.h" + +typedef struct _vsi_nn_upsamplescale_param +{ + struct _upsamplescale_local_data_t* local; + // Add parameters here + int32_t stride; + float scale; +} vsi_nn_upsamplescale_param; + +#endif + diff --git a/src/tim/vx/internal/include/vsi_nn_graph.h b/src/tim/vx/internal/include/vsi_nn_graph.h index 6e3e6fd..584bdd8 100644 --- a/src/tim/vx/internal/include/vsi_nn_graph.h +++ b/src/tim/vx/internal/include/vsi_nn_graph.h @@ -677,6 +677,11 @@ OVXLIB_API vsi_status vsi_nn_TrySetupCompleteSignalNode vsi_nn_graph_t* graph ); +vsi_status vsi_nn_setup_binary_graph_inputs_outputs + ( + vsi_nn_graph_t* graph + ); + void vsi_nn_get_tensor_consumers ( vsi_nn_graph_t* graph, diff --git a/src/tim/vx/internal/include/vsi_nn_node_type.h b/src/tim/vx/internal/include/vsi_nn_node_type.h index 9f13725..89cd104 100644 --- a/src/tim/vx/internal/include/vsi_nn_node_type.h +++ b/src/tim/vx/internal/include/vsi_nn_node_type.h @@ -56,6 +56,7 @@ #include "ops/vsi_nn_op_elu.h" #include "ops/vsi_nn_op_reverse.h" #include "ops/vsi_nn_op_space2depth.h" +#include "ops/vsi_nn_op_space2depth_internal.h" #include "ops/vsi_nn_op_depth2space.h" #include "ops/vsi_nn_op_depth2space_internal.h" #include "ops/vsi_nn_op_maximum.h" @@ -162,6 +163,7 @@ #include "ops/vsi_nn_op_resize_1d.h" #include "ops/vsi_nn_op_resize_1d_bilinear_internal.h" #include "ops/vsi_nn_op_resize_1d_nearest_internal.h" +#include "ops/vsi_nn_op_upsamplescale.h" /* custom node head define define */ #include "custom/vsi_nn_custom_node_type.h" @@ -204,6 +206,7 @@ typedef union _vsi_nn_nn_param vsi_nn_elu_param elu; vsi_nn_reverse_param reverse; vsi_nn_space2depth_param space2depth; + vsi_nn_space2depth_internal_param space2depth_internal; vsi_nn_depth2space_param depth2space; vsi_nn_depth2space_internal_param depth2space_internal; vsi_nn_maximum_param maximum; @@ -310,6 +313,7 @@ typedef union _vsi_nn_nn_param vsi_nn_resize_1d_param resize_1d; vsi_nn_resize_1d_bilinear_internal_param resize_1d_bilinear_internal; vsi_nn_resize_1d_nearest_internal_param resize_1d_nearest_internal; + vsi_nn_upsamplescale_param upsamplescale; uint8_t client_param[128]; /* custom node data struct define */ diff --git a/src/tim/vx/internal/include/vsi_nn_pre_post_process.h b/src/tim/vx/internal/include/vsi_nn_pre_post_process.h index 501fca3..74938f7 100644 --- a/src/tim/vx/internal/include/vsi_nn_pre_post_process.h +++ b/src/tim/vx/internal/include/vsi_nn_pre_post_process.h @@ -65,6 +65,12 @@ typedef enum VSI_NN_SOURCE_LAYOUT_NCHW, } vsi_nn_preprocess_source_layout_e; +typedef enum +{ + VSI_NN_DEST_LAYOUT_NHWC = 0, + VSI_NN_DEST_LAYOUT_NCHW, +} vsi_nn_preprocess_dest_layout_e; + /** * Input source format */ diff --git a/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c index 1b73c36..fe470a0 100644 --- a/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c @@ -214,7 +214,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) width = input_shape->data[0]; height = input_shape->data[1]; chn = attr[1]->shape->data[1]; - if(rsFlg) + if (rsFlg) { height = height / chn; } @@ -281,7 +281,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) width = input_shape->data[0]; height = input_shape->data[1]; chn = attr[1]->shape->data[1]; - if(rsFlg) 
+ if (rsFlg) { height = height / chn; } @@ -355,12 +355,12 @@ static vsi_status _query_kernel for( i = 0; i < kernel_map_size; i ++ ) { - if( kernel_map[i].key == hashkey ) + if ( kernel_map[i].key == hashkey ) { break; } } - if( i < kernel_map_size ) + if ( i < kernel_map_size ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); kernel->info.parameters = param_def; @@ -413,19 +413,23 @@ static vsi_nn_kernel_node_t _setup int32_t width = inputs[0]->attr.size[0]; int32_t height = inputs[0]->attr.size[1]; int32_t group_num = (width + 15) / 16; - int32_t input_zp = inputs[0]->attr.dtype.zero_point; - float input_scale = inputs[0]->attr.dtype.scale; - int32_t input_fl = inputs[0]->attr.dtype.fl; - int32_t output_zp = outputs[0]->attr.dtype.zero_point; - float output_scale = outputs[0]->attr.dtype.scale; - int32_t output_fl = outputs[0]->attr.dtype.fl; + int32_t input_zp = 0; + float input_scale = 1.0f; + int32_t input_fl = 0; + int32_t output_zp = 0; + float output_scale = 1.0f; + int32_t output_fl = 0; float in_fl_scale = 1.0f, out_fl_scale = 1.0; float dim_ratio = (float)1.0 / (float)(width * height); - if(inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT8 - || inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16 - || inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32) + if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) { + input_zp = inputs[0]->attr.dtype.zero_point; + input_scale = inputs[0]->attr.dtype.scale; + } + else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) + { + input_fl = inputs[0]->attr.dtype.fl; if (input_fl > 0) { in_fl_scale = (1.0f / ((float) ((int64_t)1 << input_fl))); @@ -434,12 +438,17 @@ static vsi_nn_kernel_node_t _setup { in_fl_scale = ((float) ((int64_t)1 << -input_fl)); } + input_zp = 0; } - if(outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT8 - || outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16 - || outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32) + if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) { + output_zp = outputs[0]->attr.dtype.zero_point; + output_scale = 1.0f / outputs[0]->attr.dtype.scale; + } + else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) + { + output_fl = outputs[0]->attr.dtype.fl; if (output_fl > 0) { out_fl_scale = (float)((int64_t)1 << output_fl); @@ -448,9 +457,10 @@ static vsi_nn_kernel_node_t _setup { out_fl_scale = (1.0f / (float)((int64_t)1 << -output_fl)); } + output_zp = 0; } - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; @@ -482,17 +492,17 @@ static vsi_nn_kernel_node_t _setup hashkey = HASH_INSTANCENORM_KEY( in0_dtype, out_dtype, reshape_flg ); status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_MEAN_VARI ); - if( VSI_SUCCESS != status ) + if ( VSI_SUCCESS != status ) { goto final; } status = _query_kernel( kernel, hashkey, INTERNAL_KERNEL_NORM ); - if( VSI_SUCCESS != status ) + if ( VSI_SUCCESS != status ) { goto final; } - if(reshape_flg) + if (reshape_flg) { int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; shape[0] = inputs[0]->attr.size[0]; @@ -507,7 +517,7 @@ static vsi_nn_kernel_node_t _setup shape[3] = outputs[0]->attr.dim_num > 3 ? 
outputs[0]->attr.size[3] : 1; rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 ); } - if(inputs[1]->attr.dim_num < 2) + if (inputs[1]->attr.dim_num < 2) { int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; shape[0] = inputs[1]->attr.size[0]; @@ -516,7 +526,7 @@ static vsi_nn_kernel_node_t _setup shape[3] = 1; rs_beta = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shape, 4 ); } - if(inputs[2]->attr.dim_num < 2) + if (inputs[2]->attr.dim_num < 2) { int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; shape[0] = inputs[2]->attr.size[0]; @@ -528,10 +538,10 @@ static vsi_nn_kernel_node_t _setup // Mean Vari { node = vsi_nn_kernel_create_node( graph, ikernels[MEAN_VARI_INDEX] ); - if(node) + if (node) { uint32_t index = 0; - if(reshape_flg) + if (reshape_flg) { mean_vari_node_params[index++] = rs_input; } @@ -565,10 +575,10 @@ static vsi_nn_kernel_node_t _setup // Nomalization { node = vsi_nn_kernel_create_node( graph, kernel ); - if(node) + if (node) { uint32_t index = 0; - if(reshape_flg) + if (reshape_flg) { node_params[index++] = rs_input; } @@ -576,7 +586,7 @@ static vsi_nn_kernel_node_t _setup { node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t; } - if(inputs[1]->attr.dim_num < 2) + if (inputs[1]->attr.dim_num < 2) { node_params[index++] = rs_beta; } @@ -584,7 +594,7 @@ static vsi_nn_kernel_node_t _setup { node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t; } - if(inputs[2]->attr.dim_num < 2) + if (inputs[2]->attr.dim_num < 2) { node_params[index++] = rs_gamma; } @@ -593,7 +603,7 @@ static vsi_nn_kernel_node_t _setup node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t; } node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t; - if(reshape_flg) + if (reshape_flg) { node_params[index++] = rs_output; } @@ -634,26 +644,26 @@ static vsi_nn_kernel_node_t _setup /* Pass parameters to node. */ final: - if(rs_beta) + if (rs_beta) { vsi_nn_kernel_tensor_release( &rs_beta ); } - if(rs_gamma) + if (rs_gamma) { vsi_nn_kernel_tensor_release( &rs_gamma ); } - if(reshape_flg) + if (reshape_flg) { vsi_nn_kernel_tensor_release( &rs_input ); vsi_nn_kernel_tensor_release( &rs_output ); } for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) { - if( ikernels[i] ) + if ( ikernels[i] ) { vsi_nn_kernel_release( &ikernels[i] ); } - if( tensors[i] ) + if ( tensors[i] ) { vsi_nn_ReleaseTensor( &tensors[i] ); } diff --git a/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c new file mode 100644 index 0000000..166f779 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c @@ -0,0 +1,395 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ + +#define KERNEL_SOURCE_1 "layer_normalization" + +#define HASH_LAYERNORM_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.layer_norm_"#SRC0_TYPE"to"#DST_TYPE) + +// Add kernel hashtable here +#define HASH_LAYERNORM_KEY(_input0_type, _output_type, _reshape_flag) \ + ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8)) + +#define TENSOR_LAYERNORM_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, 0), \ + HASH_LAYERNORM_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _layernorm_kernel_map[] = +{ + // Register kernel here + TENSOR_LAYERNORM_KERNELS( F32, F32, KERNEL_SOURCE_1 ) + TENSOR_LAYERNORM_KERNELS( U8, U8, KERNEL_SOURCE_1 ) +}; + +/* + * Kernel params + */ +static vx_param_description_t _layernorm_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _LAYERNORM_PARAM_NUM _cnt_of_array( _layernorm_kernel_param_def ) + +/* + * Kernel initializer + */ + +DEF_KERNEL_INITIALIZER(_layernorm_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + vsi_int_array_t * input_shape = NULL; + //int32_t width = 0; + int32_t height = 0; + int32_t chn = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( 
(vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + input_shape = attr[0]->shape; + //width = input_shape->data[0]; + height = input_shape->data[1]; + chn = (input_shape->size <= 2) ? 1 : input_shape->data[2]; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.local_size[0] = 16; + gpu_param.local_size[1] = 1; + gpu_param.local_size[2] = 1; + gpu_param.global_size[0] = 16; + gpu_param.global_size[1] = height; + gpu_param.global_size[2] = chn; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + return status; +} /* _layernorm_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t* kernel, + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + int32_t reshape2D + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (input0_dtype == F16 && output_dtype == F16) + { + input0_dtype = F32; + output_dtype = F32; + } + + key = HASH_LAYERNORM_KEY( input0_dtype, output_dtype, 0 ); + + for( i = 0; i < _cnt_of_array(_layernorm_kernel_map); i ++ ) + { + if ( _layernorm_kernel_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(_layernorm_kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _layernorm_kernel_map[i].function_name ); + kernel->info.parameters = _layernorm_kernel_param_def; + kernel->info.numParams = _LAYERNORM_PARAM_NUM; + kernel->info.initialize = _layernorm_initializer; + + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + _layernorm_kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + _layernorm_kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_LAYERNORM_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_kernel_tensor_t rs_gamma = NULL, rs_beta = NULL; + + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + + int32_t width = inputs[0]->attr.size[0]; + int32_t height = inputs[0]->attr.size[1]; + int32_t input_fl = 0; + float input_zp = 0.0f; + float input_scale = 1.0f; + int32_t output_fl = 0; + float output_zp = 0.0f; + float output_scale = 1.0f; + float e2InScale = 1.0f, scale_inOut = 1.0f; + float dim_ratio = (float)1.0 / (float)(width); + float sumZpScale = 0.0f; + float zp2ScaleE2 = 0.0f; + float sumZpScaleE2 = 0.0f; + + if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) + { + input_zp = 
(float)inputs[0]->attr.dtype.zero_point; + input_scale = inputs[0]->attr.dtype.scale; + } + else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) + { + input_fl = inputs[0]->attr.dtype.fl; + if (input_fl > 0) + { + input_scale = (1.0f / ((float) ((int64_t)1 << input_fl))); + } + else + { + input_scale = ((float) ((int64_t)1 << -input_fl)); + } + input_zp = 0.0f; + } + + if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) + { + output_zp = (float)outputs[0]->attr.dtype.zero_point; + output_scale = 1.0f / outputs[0]->attr.dtype.scale; + } + else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) + { + output_fl = outputs[0]->attr.dtype.fl; + if (output_fl > 0) + { + output_scale = (float)((int64_t)1 << output_fl); + } + else + { + output_scale = (1.0f / (float)((int64_t)1 << -output_fl)); + } + output_zp = 0.0f; + } + scale_inOut = input_scale * output_scale; + e2InScale = input_scale * input_scale; + sumZpScale = width * input_zp * input_scale; + zp2ScaleE2 = input_zp * 2 * e2InScale; + sumZpScaleE2 = width * input_zp * input_zp * e2InScale; + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs, 0 ); + if ( VSI_SUCCESS != status ) + { + goto final; + } + + if (inputs[1]->attr.dim_num < 2) + { + int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + shape[0] = inputs[1]->attr.size[0]; + shape[1] = 1; + shape[2] = 1; + shape[3] = 1; + rs_beta = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shape, 4 ); + } + if (inputs[2]->attr.dim_num < 2) + { + int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + shape[0] = inputs[2]->attr.size[0]; + shape[1] = 1; + shape[2] = 1; + shape[3] = 1; + rs_gamma = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shape, 4 ); + } + + // Nomalization + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if (node) + { + uint32_t index = 0; + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t; + if (inputs[1]->attr.dim_num < 2) + { + node_params[index++] = rs_beta; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t; + } + if (inputs[2]->attr.dim_num < 2) + { + node_params[index++] = rs_gamma; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t; + } + node_params[index++] = (vsi_nn_kernel_node_param_t)outputs[0]->t; + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_zp ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &e2InScale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_inOut ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &sumZpScale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &zp2ScaleE2 ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &sumZpScaleE2 ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &dim_ratio ); + + status = vsi_nn_kernel_node_pass_param( node, node_params, + _LAYERNORM_PARAM_NUM ); + CHECK_STATUS(status); + 
vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_scalar_release( &node_params[10] ); + vsi_nn_kernel_scalar_release( &node_params[11] ); + vsi_nn_kernel_scalar_release( &node_params[12] ); + vsi_nn_kernel_scalar_release( &node_params[13] ); + vsi_nn_kernel_scalar_release( &node_params[14] ); + vsi_nn_kernel_scalar_release( &node_params[15] ); + vsi_nn_kernel_scalar_release( &node_params[16] ); + } + } + + /* Pass parameters to node. */ +final: + if (rs_beta) + { + vsi_nn_kernel_tensor_release( &rs_beta ); + } + if (rs_gamma) + { + vsi_nn_kernel_tensor_release( &rs_gamma ); + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( layer_norm, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c b/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c index 272d2b0..5ccc69e 100644 --- a/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c @@ -59,6 +59,9 @@ __BEGIN_DECLS #define HASH_MATRIXMUL_TRANSA_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \ CVIVANTE_NAMESPACE("cl.gemm_transa_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM) +#define HASH_MATRIXMUL_TRANSB_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \ + CVIVANTE_NAMESPACE("cl.gemm_transb_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM) + #define TENSOR_MATRIXMUL_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \ { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0), \ HASH_MATRIXMUL_SH_KERNEL_NAME(F32, F32, F32, IMAGE_DIM), \ @@ -69,6 +72,11 @@ __BEGIN_DECLS HASH_MATRIXMUL_TRANSA_SH_KERNEL_NAME(F32, F32, F32, IMAGE_DIM), \ SOURCE }, +#define TENSOR_MATRIXMUL_TRANSB_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \ + { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 2), \ + HASH_MATRIXMUL_TRANSB_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \ + SOURCE }, + static const struct { uint32_t key; char* function_name; @@ -83,6 +91,10 @@ static const struct { TENSOR_MATRIXMUL_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_1) TENSOR_MATRIXMUL_TRANSA_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_2) TENSOR_MATRIXMUL_TRANSA_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_2) + TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, I8, F32, _2D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, I8, F32, _3D, KERNEL_SOURCE_1) }; /* @@ -98,6 +110,12 @@ static vx_param_description_t _matrixmul_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define _MATRIXMUL_PARAM_NUM _cnt_of_array(_matrixmul_kernel_param_def) @@ -130,7 +148,7 @@ DEF_KERNEL_INITIALIZER(_matrixmul_initializer) CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer 
fail.", final ); width = attr[0]->shape->data[0]; - height = attr[0]->shape->data[0]; + height = attr[0]->shape->data[1]; chn = attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1; gpu_param.global_scale[0] = 1; @@ -175,22 +193,27 @@ static vsi_status _query_kernel input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - if(depth > 1) + if (depth > 1) { dim_type = _3D; } + if (input1_dtype == I16 || input1_dtype == I32) + { + input1_dtype = I8; + } + key = HASH_MATRIXMUL_KEY( input0_dtype, input1_dtype, output_dtype, dim_type, transa ); for( i = 0; i < _cnt_of_array(matrixmul_map); i ++ ) { - if( matrixmul_map[i].key == key ) + if ( matrixmul_map[i].key == key ) { break; } } - if( i < _cnt_of_array(matrixmul_map) ) + if ( i < _cnt_of_array(matrixmul_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", matrixmul_map[i].function_name ); kernel->info.parameters = _matrixmul_kernel_param_def; @@ -223,48 +246,111 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; int32_t transposeA = vsi_nn_kernel_param_get_int32( params, "transposeA" ); int32_t transposeB = vsi_nn_kernel_param_get_int32( params, "transposeB" ); + int32_t transFlg = 0; uint32_t M = inputs[0]->attr.size[1]; uint32_t K = inputs[0]->attr.size[0]; uint32_t N = inputs[1]->attr.size[0]; uint32_t depth = outputs[0]->attr.dim_num > 2 ? outputs[0]->attr.size[2] : 1; uint32_t ac2zero = 0; uint32_t bc2zero = 0; + float scale_a = 1.0f; + float zp_a = 0; + float scale_b = 1.0f; + float zp_b = 0; + float scale_out = 1.0f; + float zp_out = 0; - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; } - if(transposeB) + if (transposeB) { - return NULL; + N = inputs[1]->attr.size[1]; + transFlg = 2; } - if(transposeA) + if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) + { + if (inputs[0]->attr.dtype.fl > 0) + { + scale_a = (1.0f / ((float) ((int64_t)1 << inputs[0]->attr.dtype.fl))); + } + else + { + scale_a = ((float) ((int64_t)1 << -inputs[0]->attr.dtype.fl)); + } + zp_a = 0; + } + else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) + { + zp_a = (float)inputs[0]->attr.dtype.zero_point; + scale_a = inputs[0]->attr.dtype.scale; + } + + if (inputs[1]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) + { + if (inputs[1]->attr.dtype.fl > 0) + { + scale_b = (1.0f / ((float) ((int64_t)1 << inputs[1]->attr.dtype.fl))); + } + else + { + scale_b = ((float) ((int64_t)1 << -inputs[1]->attr.dtype.fl)); + } + zp_b = 0; + } + else if (inputs[1]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) + { + zp_b = (float)inputs[1]->attr.dtype.zero_point; + scale_b = inputs[1]->attr.dtype.scale; + } + + if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) + { + if (outputs[0]->attr.dtype.fl > 0) + { + scale_out = (float)((int64_t)1 << outputs[0]->attr.dtype.fl); + } + else + { + scale_out = (1.0f / (float)((int64_t)1 << -outputs[0]->attr.dtype.fl)); + } + zp_out = 0; + } + else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) + { + zp_out = (float)outputs[0]->attr.dtype.zero_point; + scale_out = outputs[0]->attr.dtype.scale; + } + + if (transposeA) { K = inputs[0]->attr.size[1]; M = inputs[0]->attr.size[0]; + transFlg = 1; } - if((inputs[0]->attr.dim_num > 
inputs[1]->attr.dim_num) || + if ((inputs[0]->attr.dim_num > inputs[1]->attr.dim_num) || (inputs[0]->attr.size[2] > inputs[1]->attr.size[2] && inputs[0]->attr.dim_num > 2 && inputs[1]->attr.dim_num > 2)) { bc2zero = 1; } - else if((inputs[1]->attr.dim_num > inputs[0]->attr.dim_num) || + else if ((inputs[1]->attr.dim_num > inputs[0]->attr.dim_num) || (inputs[1]->attr.size[2] > inputs[0]->attr.size[2] && inputs[0]->attr.dim_num > 2 && inputs[1]->attr.dim_num > 2)) { ac2zero = 1; } - status = _query_kernel( kernel, inputs, outputs, depth, transposeA ); - if( VSI_SUCCESS == status) + status = _query_kernel( kernel, inputs, outputs, depth, transFlg ); + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { uint32_t index = 3; /* Pass parameters to node. */ @@ -275,6 +361,12 @@ static vsi_nn_kernel_node_t _setup node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &N ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ac2zero ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &bc2zero ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_a ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &zp_a ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_b ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &zp_b ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_out ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &zp_out ); /* Pass parameters to node. */ status = vsi_nn_kernel_node_pass_param( node, node_params, _MATRIXMUL_PARAM_NUM ); CHECK_STATUS(status); @@ -283,6 +375,12 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[5] ); vsi_nn_kernel_scalar_release( &node_params[6] ); vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_scalar_release( &node_params[10] ); + vsi_nn_kernel_scalar_release( &node_params[11] ); + vsi_nn_kernel_scalar_release( &node_params[12] ); + vsi_nn_kernel_scalar_release( &node_params[13] ); } } return node; diff --git a/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c b/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c new file mode 100644 index 0000000..3f4402a --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c @@ -0,0 +1,329 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +#define _ROI_ALIGN_KERNEL_SOURCE(_input_type) "roi_align" + +#define STR(a) #a +// Add kernel hashtable here +#define ROI_ALIGN_HASH_KEY( IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE, _image_2d ) \ + (( IN0_DTYPE ) | ( IN1_DTYPE << 7) | (IN2_DTYPE << 14) | (OUT_DTYPE << 21) | (_image_2d << 28)) + +#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE ) \ + { ROI_ALIGN_HASH_KEY( IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE, 0 ), \ + CVIVANTE_NAMESPACE("cl.roi_align_"STR(IN0_DTYPE)"to"STR(OUT_DTYPE)), \ + _ROI_ALIGN_KERNEL_SOURCE(IN0_DTYPE) } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _roi_align_kernel_map[] = +{ + PACK_KERNEL_MAP(F32, F32, I32, F32), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _roi_align_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _ROI_ALIGN_PARAM_NUM _cnt_of_array( _roi_align_kernel_param_def ) + +#define SCALAR_SPATIAL_X_SCALE (4) +#define SCALAR_SPATIAL_Y_SCALE (5) +#define SCALAR_INPUT_WIDTH (6) +#define SCALAR_INPUT_HEIGHT (7) +#define SCALAR_RCP_OF_OUTPUT_WIDTH (8) +#define SCALAR_RCP_OF_OUTPUT_HEIGHT (9) +#define SCALAR_SAMPLING_X_RATIO (10) +#define SCALAR_SAMPLING_Y_RATIO (11) +#define SCALAR_DEPTH (12) + +#define ROI_ALIGN_PARAM_NUM 13 +#define ROI_ALIGN_QUANT_PARAM_NUM _cnt_of_array( _roi_align_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_roi_align_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * rois_attr = NULL; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_int_array_t * rois_shape = NULL; + vsi_int_array_t * out_shape = NULL; + + rois_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( rois_attr, "Create tensor attr buffer fail.", 
final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + rois_shape = rois_attr->shape; + out_shape = output_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.dim = 3; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = rois_shape->data[1]; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(output_attr); + + return status; +} /* _roi_align_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e in2_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _roi_align_kernel_map; + size_t kernel_map_size = _cnt_of_array( _roi_align_kernel_map ); + vx_param_description_t * param_def = _roi_align_kernel_param_def; + size_t param_def_size = ROI_ALIGN_QUANT_PARAM_NUM; + vx_kernel_initialize_f initializer = _roi_align_initializer; + + uint32_t key; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + in2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + in0_dtype = in0_dtype == F16 ? F32 : in0_dtype; + in1_dtype = in1_dtype == F16 ? 
F32 : in1_dtype; + + key = ROI_ALIGN_HASH_KEY( in0_dtype, in1_dtype, in2_dtype, out_dtype, image_2d ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; + +} /* _query_kernel() */ + +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_ROI_ALIGN_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_bool image_2d = FALSE; + uint32_t rank[_IO_NUM] = {0}; + int32_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + vsi_nn_tensor_t* reshape_tensors[_IO_NUM] = { NULL }; + int32_t i = 0; + float width_ratio = vsi_nn_kernel_param_get_float32( params, "width_ratio" ); + float height_ratio = vsi_nn_kernel_param_get_float32( params, "height_ratio" ); + int32_t width_sample_num = vsi_nn_kernel_param_get_int32( params, "width_sample_num" ); + int32_t height_sample_num = vsi_nn_kernel_param_get_int32( params, "height_sample_num" ); + float width_scale = 1.0f / width_ratio; + float height_scale = 1.0f / height_ratio; + float in_width = (float)(inputs[0]->attr.size[0]); + float in_height = (float)(inputs[0]->attr.size[1]); + float rcp_of_out_width = 1.0f / (float)(outputs[0]->attr.size[0]); + float rcp_of_out_height = 1.0f / (float)(outputs[0]->attr.size[1]); + float sampling_x_ratio = width_sample_num > 0 ? (float)width_sample_num : 0; + float sampling_y_ratio = height_sample_num > 0 ? 
(float)height_sample_num : 0; + int depth = inputs[0]->attr.size[2]; + + vsi_nn_kernel_optimize_nchw2xhw_shape( (const int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, + shapes[0], &rank[0]); + vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, + shapes[1], &rank[1]); + vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[2]->attr.size, inputs[2]->attr.dim_num, + shapes[2], &rank[2]); + vsi_nn_kernel_optimize_nchw2xhw_shape( (const int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[3], &rank[3]); + + for (i = 0; i < _INPUT_NUM; i++) + { + reshape_tensors[i] = vsi_nn_reshape_tensor( graph, + inputs[i], (uint32_t*)shapes[i], rank[i] ); + } + reshape_tensors[_INPUT_NUM] = vsi_nn_reshape_tensor( graph, + outputs[0], (uint32_t*)shapes[_INPUT_NUM], rank[_INPUT_NUM] ); + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[0]->attr.size, + inputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( kernel, reshape_tensors, &reshape_tensors[_INPUT_NUM], image_2d); + + if ( VSI_SUCCESS == status ) + { + size_t node_params_num = ROI_ALIGN_PARAM_NUM; + + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _ROI_ALIGN_PARAM_NUM, + reshape_tensors, input_num, &reshape_tensors[_INPUT_NUM], output_num ); + + node_params[SCALAR_SPATIAL_X_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &width_scale ); + node_params[SCALAR_SPATIAL_Y_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &height_scale ); + node_params[SCALAR_INPUT_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &in_width ); + node_params[SCALAR_INPUT_HEIGHT] = vsi_nn_kernel_scalar_create( graph, F32, &in_height ); + node_params[SCALAR_RCP_OF_OUTPUT_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &rcp_of_out_width ); + node_params[SCALAR_RCP_OF_OUTPUT_HEIGHT] = vsi_nn_kernel_scalar_create( graph, F32, &rcp_of_out_height ); + node_params[SCALAR_SAMPLING_X_RATIO] = vsi_nn_kernel_scalar_create( graph, F32, &sampling_x_ratio ); + node_params[SCALAR_SAMPLING_Y_RATIO] = vsi_nn_kernel_scalar_create( graph, F32, &sampling_y_ratio ); + node_params[SCALAR_DEPTH] = vsi_nn_kernel_scalar_create( graph, I32, &depth ); + + /* Pass parameters to node. 
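The node takes its own reference to each scalar, so the local handles are released immediately after the pass.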
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SPATIAL_X_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SPATIAL_Y_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_WIDTH] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_HEIGHT] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_RCP_OF_OUTPUT_WIDTH] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_RCP_OF_OUTPUT_HEIGHT] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SAMPLING_X_RATIO] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SAMPLING_Y_RATIO] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_DEPTH] ); + } + } + + for (i = 0; i < _IO_NUM; i++) + { + if (reshape_tensors[i]) + { + vsi_nn_ReleaseTensor( &reshape_tensors[i] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( roi_align, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/space2depth_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/space2depth_internal_cl.c new file mode 100644 index 0000000..b021962 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/space2depth_internal_cl.c @@ -0,0 +1,298 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
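+ * The hash key combines the input/output dtypes with an optimization flag; only block_size 2x1 selects the dedicated *_X2Y1 kernels.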
+ */ +#define KERNEL_SOURCE_1 "space2depth_internal" + +#define HASH_SPACE2DEPTH_INTERNAL_KEY(_input0_type, _output_type, _opt_flg) \ + ((_input0_type << 24) | (_output_type << 16) | (_opt_flg << 8)) + +#define HASH_SPACE2DEPTH_INTERNAL_CL_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.space2depth_internal_"#SRC0_TYPE"to"#DST_TYPE) + +#define HASH_SPACE2DEPTH_INTERNAL_X2Y1_CL_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.space2depth_internal_"#SRC0_TYPE"to"#DST_TYPE"_X2Y1") + +#define TENSOR_SPACE2DEPTH_INTERNAL_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SPACE2DEPTH_INTERNAL_KEY(IN0_TYPE, OUT_TYPE, 0), \ + HASH_SPACE2DEPTH_INTERNAL_CL_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + + #define TENSOR_SPACE2DEPTH_INTERNAL_OPT_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SPACE2DEPTH_INTERNAL_KEY(IN0_TYPE, OUT_TYPE, 1), \ + HASH_SPACE2DEPTH_INTERNAL_X2Y1_CL_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } kernel_map[] = +{ + TENSOR_SPACE2DEPTH_INTERNAL_KERNELS(F32, F32, KERNEL_SOURCE_1) + TENSOR_SPACE2DEPTH_INTERNAL_KERNELS(U8, U8, KERNEL_SOURCE_1) + TENSOR_SPACE2DEPTH_INTERNAL_OPT_KERNELS(F32, F32, KERNEL_SOURCE_1) + TENSOR_SPACE2DEPTH_INTERNAL_OPT_KERNELS(U8, U8, KERNEL_SOURCE_1) +}; + +/* + * Kernel params + */ +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define _CL_PARAM_NUM _cnt_of_array(kernel_param_def) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_space2depth_internal_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + vsi_int_array_t * in_shape = NULL; + int32_t width = 0; + int32_t height = 0; + int32_t chn = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + + in_shape = attr[0]->shape; + width = in_shape->data[0]; + height = in_shape->data[1]; + chn = in_shape->size > 2 ? 
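/* use the channel count when the shape has one, otherwise 1 */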
in_shape->data[2] : 1; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = height; + gpu_param.global_size[2] = chn; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + + return status; +} /* _space2depth_internal_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel, + int32_t opt_flg + ) +{ + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + vsi_status status = VSI_FAILURE; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + key = HASH_SPACE2DEPTH_INTERNAL_KEY( input0_dtype, output_dtype, opt_flg ); + + if (input0_dtype == F16 && output_dtype == F16) + { + input0_dtype = F32; + output_dtype = F32; + } + + for( i = 0; i < _cnt_of_array(kernel_map); i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = kernel_param_def; + kernel->info.numParams = _cnt_of_array( kernel_param_def ); + kernel->info.initialize = _space2depth_internal_initializer; + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t block_size_x = vsi_nn_kernel_param_get_int32( params, "block_size_x" ); + int32_t block_size_y = vsi_nn_kernel_param_get_int32( params, "block_size_y" ); + int32_t opt_flg = (block_size_x == 2 && block_size_y == 1) ? 
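/* only the 2x1 block size has an optimized kernel variant */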
1 : 0; + + float inputScale = inputs[0]->attr.dtype.scale; + int32_t inputZp = inputs[0]->attr.dtype.zero_point; + float outputScale = outputs[0]->attr.dtype.scale; + int32_t outputZp = outputs[0]->attr.dtype.zero_point; + float scaleInOut = 1.0f; + float zpInOut = 0.0f; + + if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) + { + int32_t input_fl = inputs[0]->attr.dtype.fl; + if (input_fl > 0) + { + inputScale = (1.0f / ((float) ((int64_t)1 << input_fl))); + } + else + { + inputScale = ((float) ((int64_t)1 << -input_fl)); + } + inputZp = 0; + } + else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE) + { + inputScale = 1.0f; + inputZp = 0; + } + + if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) + { + int32_t output_fl = outputs[0]->attr.dtype.fl; + if (output_fl > 0) + { + outputScale = (1.0f / ((float) ((int64_t)1 << output_fl))); + } + else + { + outputScale = ((float) ((int64_t)1 << -output_fl)); + } + outputZp = 0; + } + else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE) + { + outputScale = 1.0f; + outputZp = 0; + } + scaleInOut = inputScale / outputScale; + zpInOut = outputZp - inputZp * scaleInOut; + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( inputs, outputs, kernel, opt_flg); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + + if ( node ) + { + int32_t index = 2; + vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM, + inputs, 1, outputs, 1 ); + node_params[index++] = vsi_nn_kernel_scalar_create( + graph, I32, &block_size_x ); + node_params[index++] = vsi_nn_kernel_scalar_create( + graph, I32, &block_size_y ); + node_params[index++] = vsi_nn_kernel_scalar_create( + graph, F32, &scaleInOut ); + node_params[index] = vsi_nn_kernel_scalar_create( + graph, F32, &zpInOut ); + + /* Pass parameters to node. 
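scaleInOut and zpInOut fold the input and output quantization into a single affine step, out = in * scaleInOut + zpInOut.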
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[2] ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( space2depth_internal, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c b/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c index b1e9860..6720a14 100644 --- a/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c @@ -173,7 +173,7 @@ final: if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } } return status; -} /* _pre_process_yuv420_exec() */ +} /* _instance_norm_exec() */ /* * Kernel params */ diff --git a/src/tim/vx/internal/src/kernel/cpu/layer_normalization_cpu.c b/src/tim/vx/internal/src/kernel/cpu/layer_normalization_cpu.c new file mode 100644 index 0000000..d6d9802 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/layer_normalization_cpu.c @@ -0,0 +1,255 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "client/vsi_nn_vxkernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
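+ * The CPU reference takes the input, bias and scale tensors plus an eps scalar, and writes a single output tensor.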
+ */ +#define _CPU_ARG_NUM (1) +#define _CPU_INPUT_NUM (3) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.layer_norm") + +DEF_KERNEL_EXECUTOR(_layer_norm_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[_CPU_IO_NUM] = { NULL }; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + uint32_t i = 0; + float eps = .0f; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; + tensors[3] = (vsi_nn_kernel_tensor_t)param[3]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] ); + CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] ); + + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[4], &eps); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final ); + + buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[2], "Create input1 buffer fail.", final ); + + buffer[3] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[3], "Create output buffer fail.", final ); + memset( buffer[3], 0, out_elements * sizeof(float) ); + + { + uint32_t axis_first = 0; + uint32_t axis_num = 1; + uint32_t outerSize = 1; + uint32_t axisSize = 1; + uint32_t innerSize = 1; + uint32_t inner = 0; + uint32_t outer = 0; + + for (i = 0; i < (uint32_t)axis_first; i++) + { + innerSize *= attr[0]->shape->data[i]; + } + + for(i = 0; i < (uint32_t)axis_num; i++) + { + axisSize *= attr[0]->shape->data[axis_first + i]; + } + + for (i = (uint32_t)axis_first + axis_num; i < attr[0]->shape->size; i++) + { + outerSize *= attr[0]->shape->data[i]; + } + + for ( outer = 0; outer < outerSize; ++outer) + { + for ( inner = 0; inner < innerSize; ++inner) + { + float sum = .0f; + float sumsq = .0f; + float mean = .0f; + float vari = .0f; + + for (i = 0; i < (uint32_t)axisSize; ++i) + { + float value = buffer[0][(outer * axisSize + i) * innerSize + inner]; + sum += value; + sumsq += (value * value); + } + mean = sum / (axisSize); + vari = sumsq / (axisSize) - mean * mean; + vari = (float)(1.0 / sqrtf(vari + eps)); + + for (i = 0; i < (uint32_t)axisSize; ++i) + { + int idx = (outer * axisSize + i) * innerSize + inner; + float data = buffer[0][idx] - mean; + float scaleVal = buffer[2][idx]; + float biasVal = buffer[1][idx]; + float normVal = data * vari * scaleVal + biasVal; + buffer[3][idx] = normVal; + } + } + } 
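+ /* The loops above implement layer normalization per (outer, inner) slice:
+ * mean = sum(x) / axisSize
+ * vari = sum(x^2) / axisSize - mean^2
+ * y = (x - mean) / sqrt(vari + eps) * scale + bias
+ */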
+ } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3], + buffer[3], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if ( buffer[i] ) + { + free( buffer[i] ); + } + } + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + } + return status; +} /* _layer_norm_exec() */ +/* + * Kernel params + */ +static vx_param_description_t _layer_normalization_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _LAYER_NORMALIZATION_PARAM_NUM _cnt_of_array( _layer_normalization_kernel_param_def ) + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _layer_norm_exec, + _layer_normalization_kernel_param_def, + _LAYER_NORMALIZATION_PARAM_NUM, + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( inputs, outputs, kernel ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + backend_params[4] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + CHECK_STATUS( status ); + vsi_nn_kernel_scalar_release( &backend_params[4] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( layer_norm, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c b/src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c new file mode 100644 index 0000000..2aa18cd --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c @@ -0,0 +1,378 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
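+ * Inputs are the feature map, the ROI boxes (4 values per ROI) and the per-ROI batch indices; four scalars carry the spatial scale ratios and sample counts.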
+ */ +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.roi_align") + + +/* + * Kernel params + */ +static vx_param_description_t _roi_align_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _ROI_ALIGN_PARAM_NUM _cnt_of_array( _roi_align_kernel_param_def ) +#define SCALAR_X_RATIO (4) +#define SCALAR_Y_RATIO (5) +#define SCALAR_X_SAMPLE (6) +#define SCALAR_Y_SAMPLE (7) + +/* + * Kernel function + */ +static float _compute_region_coordinate(int32_t p, float bin_size, float roi_anchor, float max_value) +{ + const float region_start = p * bin_size + roi_anchor; + + return vsi_nn_clamp(region_start, 0.0f, max_value - 1); +} + +static float _roi_align_1x1(float *input_ptr, + int32_t width, + int32_t height, + float region_start_x, + float bin_size_x, + int32_t grid_size_x, + float region_end_x, + float region_start_y, + float bin_size_y, + int32_t grid_size_y, + float region_end_y) +{ + if ((region_end_x <= region_start_x) || (region_end_y <= region_start_y)) + { + return 0; + } + else + { + float avg = 0; + int32_t iy = 0; + int32_t ix = 0; + // Iterate through the aligned pooling region + for (iy = 0; iy < grid_size_y; ++iy) + { + for (ix = 0; ix < grid_size_x; ++ix) + { + // Align the window in the middle of every bin + float y = region_start_y + ((float)iy + 0.5f) * bin_size_y / (float)(grid_size_y); + float x = region_start_x + ((float)ix + 0.5f) * bin_size_x / (float)(grid_size_x); + + // Interpolation in the [0,0] [0,1] [1,0] [1,1] square + const int32_t y_low = (int32_t)y; + const int32_t x_low = (int32_t)x; + const int32_t y_high = vsi_nn_min(y_low + 1, height - 1); + const int32_t x_high = vsi_nn_min(x_low + 1, width - 1); + + const float ly = y - y_low; + const float lx = x - x_low; + const float hy = 1.0f - ly; + const float hx = 1.0f - lx; + + const float w1 = hy * hx; + const float w2 = hy * lx; + const float w3 = ly * hx; + const float w4 = ly * lx; + + const float data1 = *(input_ptr + y_low * width + x_low); + const float data2 = *(input_ptr + y_low * width + x_high); + const float data3 = *(input_ptr + y_high * width + x_low); + const float data4 = *(input_ptr + y_high * width + x_high); + + avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; + } + } + + avg /= grid_size_x * grid_size_y; + + return avg; + } +} + +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i = 0; + float width_scale = 0.0f; + float height_scale = 0.0f; + float width_ratio = 0.0f; + float height_ratio = 0.0f; + int32_t 
width_sample_num = 0; + int32_t height_sample_num = 0; + uint32_t n = 0; + uint32_t num_rois = 0; + int32_t inHeight = 0; + int32_t inWidth = 0; + int32_t inDepth = 0; + int32_t outHeight = 0; + int32_t outWidth = 0; + uint32_t kRoiDim = 4; + uint32_t out_index = 0; + + /* prepare data */ + for (i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + + } + for (i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_X_RATIO], &(width_ratio)); + vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_Y_RATIO], &(height_ratio)); + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_X_SAMPLE], &(width_sample_num)); + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_Y_SAMPLE], &(height_sample_num)); + + width_scale = 1.0f / width_ratio; + height_scale = 1.0f / height_ratio; + num_rois = in_attr[1]->shape->data[1]; + + inWidth = in_attr[0]->shape->data[0]; + inHeight = in_attr[0]->shape->data[1]; + inDepth = in_attr[0]->shape->data[2]; + outWidth = out_attr[0]->shape->data[0]; + outHeight = out_attr[0]->shape->data[1]; + + for (n = 0; n < num_rois; n++) + { + uint32_t batchId = (uint32_t)f32_in_buffer[2][n]; + float scale = (in_attr[1]->dtype == U16) ? 0.125f : 1.0f; + float qx1 = f32_in_buffer[1][n * kRoiDim]; + float qy1 = f32_in_buffer[1][n * kRoiDim + 1]; + float qx2 = f32_in_buffer[1][n * kRoiDim + 2]; + float qy2 = f32_in_buffer[1][n * kRoiDim + 3]; + + float x1 = qx1 * scale; + float x2 = qx2 * scale; + float y1 = qy1 * scale; + float y2 = qy2 * scale; + float roi_anchor_x = x1 * width_scale; + float roi_anchor_y = y1 * height_scale; + float roi_dims_x = vsi_nn_max((x2 - x1) * width_scale, 1.0f); + float roi_dims_y = vsi_nn_max((y2 - y1) * height_scale, 1.0f); + float bin_size_x = roi_dims_x / outWidth; + float bin_size_y = roi_dims_y / outHeight; + + int32_t batch_base_index = batchId * inHeight * inWidth * inDepth; + int32_t ch = 0; + int32_t py = 0; + int32_t px = 0; + + for (ch = 0; ch < inDepth; ch++) + { + for (py = 0; py < outHeight; py++) + { + for (px = 0; px < outWidth; px++) + { + float region_start_x = _compute_region_coordinate(px, bin_size_x, + roi_anchor_x, (float)inWidth); + float region_start_y = _compute_region_coordinate(py, bin_size_y, + roi_anchor_y, (float)inHeight); + float region_end_x = _compute_region_coordinate(px + 1, bin_size_x, + roi_anchor_x, (float)inWidth); + float region_end_y = _compute_region_coordinate(py + 1, bin_size_y, + roi_anchor_y, (float)inHeight); + + int32_t roi_bin_grid_x = (width_sample_num > 0) ? width_sample_num : (int32_t)(ceil(bin_size_x)); + int32_t roi_bin_grid_y = (height_sample_num > 0) ? 
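/* a non-positive sample count falls back to an adaptive grid of ceil(bin_size) samples */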
height_sample_num : (int32_t)(ceil(bin_size_y)); + + float *input_ptr = &f32_in_buffer[0][batch_base_index + ch * inWidth * inHeight]; + float out_val = 0; + + out_val = _roi_align_1x1( + input_ptr, inWidth, inHeight, region_start_x, bin_size_x, + roi_bin_grid_x, region_end_x, region_start_y, bin_size_y, + roi_bin_grid_y, region_end_y); + + f32_out_buffer[0][out_index++] = out_val; + } + } + } + } + + /* save data */ + for (i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + + for (i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _roi_align_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _roi_align_kernel_param_def ); + status = VSI_SUCCESS; + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_ROI_ALIGN_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + float width_ratio = vsi_nn_kernel_param_get_float32( params, "width_ratio" ); + float height_ratio = vsi_nn_kernel_param_get_float32( params, "height_ratio" ); + int32_t width_sample_num = vsi_nn_kernel_param_get_int32( params, "width_sample_num" ); + int32_t height_sample_num = vsi_nn_kernel_param_get_int32( params, "height_sample_num" ); + + status = _query_kernel( kernel, inputs, outputs ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _ROI_ALIGN_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_X_RATIO] = vsi_nn_kernel_scalar_create( graph, F32, &width_ratio ); + node_params[SCALAR_Y_RATIO] = vsi_nn_kernel_scalar_create( graph, F32, &height_ratio ); + node_params[SCALAR_X_SAMPLE] = vsi_nn_kernel_scalar_create( graph, I32, &width_sample_num ); + node_params[SCALAR_Y_SAMPLE] = vsi_nn_kernel_scalar_create( graph, I32, &height_sample_num ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _ROI_ALIGN_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_X_RATIO] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_Y_RATIO] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_X_SAMPLE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_Y_SAMPLE] ); + } + } + + return node; + +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( roi_align, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/space2depth_internal_cpu.c b/src/tim/vx/internal/src/kernel/cpu/space2depth_internal_cpu.c new file mode 100644 index 0000000..4df8a52 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/space2depth_internal_cpu.c @@ -0,0 +1,230 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "client/vsi_nn_vxkernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
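+ * space2depth with independent block sizes: each block_size_x by block_size_y spatial tile is folded into the channel
+ * dimension, moving input pixel (w, h, d) to (w / bx, h / by, (w % bx) * D + (h % by) * bx * D + d) for input depth D.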
+ */ +#define _CPU_ARG_NUM (2) +#define _CPU_INPUT_NUM (1) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.space2depth_internal") + +DEF_KERNEL_EXECUTOR(_space2depth_internal_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VX_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[2] = { NULL }; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + uint32_t i = 0; + int32_t block_size_x = 1; + int32_t block_size_y = 1; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &block_size_x); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &block_size_y); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer[1] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); + memset( buffer[1], 0, out_elements * sizeof(float) ); + + { + uint32_t output_depth = attr[1]->shape->data[2]; + uint32_t output_height = attr[1]->shape->data[1]; + uint32_t output_width = attr[1]->shape->data[0]; + uint32_t input_batch = attr[0]->shape->size > 3 ? 
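/* inputs without a batch dimension are treated as a single batch */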
attr[0]->shape->data[3] : 1; + uint32_t input_depth = attr[0]->shape->data[2]; + uint32_t input_height = attr[0]->shape->data[1]; + uint32_t input_width = attr[0]->shape->data[0]; + uint32_t batch = 0, in_h = 0, in_w = 0; + + for (batch = 0; batch < input_batch; ++ batch) + { + uint32_t output_batch_index = batch * output_height * output_width * output_depth; + uint32_t input_batch_index = batch * input_height * input_width * input_depth; + uint32_t in_d = 0; + + for (in_d = 0; in_d < input_depth; in_d ++) + { + for (in_h = 0; in_h < input_height; ++ in_h) + { + for (in_w = 0; in_w < input_width; in_w ++) + { + uint32_t out_w = in_w / block_size_x; + uint32_t out_h = in_h / block_size_y; + uint32_t out_d = (in_w % block_size_x) * input_depth + + (in_h % block_size_y) * block_size_x * input_depth + in_d; + + uint32_t in_index = in_w + in_h * input_width + + in_d * input_height * input_width + input_batch_index; + uint32_t out_index = out_w + out_h * output_width + + out_d * output_width * output_height + output_batch_index; + + buffer[1][out_index] = buffer[0][in_index]; + } + } + } + } + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], + buffer[1], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + if ( buffer[i] ) + { + free( buffer[i] ); + } + } + return status; +} /* _depth2space_crd_exec() */ +/* + * Kernel params + */ +static vx_param_description_t _space2depth_internal_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _DEPTH2SPACE_CRD_PARAM_NUM _cnt_of_array( _space2depth_internal_kernel_param_def ) + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _space2depth_internal_exec, + _space2depth_internal_kernel_param_def, + _cnt_of_array( _space2depth_internal_kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VX_FAILURE; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( inputs, outputs, kernel ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + uint32_t index = 2; + int32_t block_size_x = vsi_nn_kernel_param_get_int32( params, "block_size_x" ); + int32_t block_size_y = vsi_nn_kernel_param_get_int32( params, "block_size_y" ); + + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size_x ); + backend_params[index] = 
vsi_nn_kernel_scalar_create( graph, I32, &block_size_y ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + CHECK_STATUS( status ); + vsi_nn_kernel_scalar_release( &backend_params[2] ); + vsi_nn_kernel_scalar_release( &backend_params[3] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( space2depth_internal, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/upsamplescale_cpu.c b/src/tim/vx/internal/src/kernel/cpu/upsamplescale_cpu.c new file mode 100644 index 0000000..e8b49f9 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/upsamplescale_cpu.c @@ -0,0 +1,264 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
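+ * upsamplescale replicates each input pixel into a stride x stride block of the output and multiplies it by a constant scale factor.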
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.upsamplescale") + + +/* + * Kernel params + */ +static vx_param_description_t _upsamplescale_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _UPSAMPLESCALE_PARAM_NUM _cnt_of_array( _upsamplescale_kernel_param_def ) + +#define SCALAR_STRIDE_VALUE (2) +#define SCALAR_SCALE_VALUE (3) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + int32_t i = 0; + int32_t stride = 0; + float scale = 0.0f; + int32_t width = 0; + int32_t height = 0; + int32_t out_width = 0; + int32_t out_height = 0; + int32_t outerSize = 1; + int32_t x = 0; + int32_t y = 0; + + /* prepare data */ + for(i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + + } + for(i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_STRIDE_VALUE], &stride); + vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_SCALE_VALUE], &scale); + + width = in_attr[0]->shape->data[0]; + height = in_attr[0]->shape->data[1]; + for (i = 2; i < (int32_t)in_attr[0]->shape->size; i++) + { + outerSize *= in_attr[0]->shape->data[i]; + } + + out_width = out_attr[0]->shape->data[0]; + out_height = out_attr[0]->shape->data[1]; + + for (i = 0; i < outerSize; i++) + { + for (y = 0; y < height; y++) + { + for (x = 0; x < width; x++) + { + int32_t in_idx = i * width * height + y * width + x; + int32_t base_idx = i * out_width * out_height + + y * stride * out_width + x * stride; + int32_t dx = 0; + int32_t dy = 0; + float data = f32_in_buffer[0][in_idx] * scale; + + for (dy = 0; dy < stride; dy++) + { + for (dx = 0; dx < stride; dx++) + { + int32_t idx = base_idx + dy * out_width + dx; + + f32_out_buffer[0][idx] = data; + } + } + + } + } + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], 
out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for(i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_SUCCESS; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _upsamplescale_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _upsamplescale_kernel_param_def ); + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_UPSAMPLESCALE_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t stride = 0; + float scale = 1.0f; + + stride = vsi_nn_kernel_param_get_int32(params, "stride"); + scale = vsi_nn_kernel_param_get_float32(params, "scale"); + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _UPSAMPLESCALE_PARAM_NUM, + inputs, input_num, outputs, output_num ); + + node_params[SCALAR_STRIDE_VALUE] = vsi_nn_kernel_scalar_create( + graph, I32, &stride ); + node_params[SCALAR_SCALE_VALUE] = vsi_nn_kernel_scalar_create( + graph, F32, &scale ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _UPSAMPLESCALE_PARAM_NUM ); + + vsi_nn_kernel_scalar_release( &node_params[SCALAR_STRIDE_VALUE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_VALUE] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( upsamplescale, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/a_times_b_plus_c_evis.c b/src/tim/vx/internal/src/kernel/evis/a_times_b_plus_c_evis.c index 4401d24..b634604 100644 --- a/src/tim/vx/internal/src/kernel/evis/a_times_b_plus_c_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/a_times_b_plus_c_evis.c @@ -79,8 +79,10 @@ typedef struct static const _kernel_map_type _a_times_b_plus_c_kernel_map[] = { PACK_KERNEL_MAP(F16, F16, F16, F16), + PACK_KERNEL_MAP(F16, F16, F32, F16), PACK_KERNEL_MAP_2D(F16, F16, F16, F16), + PACK_KERNEL_MAP_2D(F16, F16, F32, F16), }; /* @@ -106,7 +108,7 @@ DEF_KERNEL_INITIALIZER(_a_times_b_plus_c_initializer) ) { #define _PACK_A_TIMES_B_PLUS_C_KEY( IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE ) \ - (( IN1_TYPE << 24) | ( IN1_TYPE << 16) | ( IN0_TYPE << 8) | ( OUT_TYPE)) + (( IN2_TYPE << 24) | ( IN1_TYPE << 16) | ( IN0_TYPE << 8) | ( OUT_TYPE)) vsi_status status = VX_SUCCESS; // Alignment with a power of two value. 
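+ /* The pack key now carries the C-operand dtype in its top byte (it previously packed IN1_TYPE twice), so the new F16*F16+F32 entries below can be selected. */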
gpu_param_t gpu_param = { @@ -183,6 +185,48 @@ DEF_KERNEL_INITIALIZER(_a_times_b_plus_c_initializer) CHECK_STATUS_FAIL_GOTO(status, final ); } break; + case _PACK_A_TIMES_B_PLUS_C_KEY( F16, F16, F32, F16 ): + { + gpu_dp_inst_t uniA_Times_B_lo_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x01010101, // BSelt + 0x00010000, 0x00030002, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniA_Times_B_hi_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x01010101, // BSelt + 0x00050004, 0x00070006, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniA_Times_B_lo_4x4", &uniA_Times_B_lo_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniA_Times_B_hi_4x4", &uniA_Times_B_hi_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtractHalf8_2x8", &uniExtractHalf8_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; default: break; } @@ -223,13 +267,13 @@ static vsi_status _query_kernel vx_param_description_t * param_def = _a_times_b_plus_c_kernel_param_def; size_t param_def_size = _cnt_of_array( _a_times_b_plus_c_kernel_param_def ); vx_kernel_initialize_f initializer = _a_times_b_plus_c_initializer; - uint32_t key; - uint32_t i; + uint32_t key = 0; + uint32_t i = 0; in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); - in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); - in2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); - out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + in2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); key = A_TIMES_B_PLUS_C_HASH_KEY(in0_dtype, in1_dtype, in2_dtype, out_dtype, image_2d); diff --git a/src/tim/vx/internal/src/kernel/evis/gather_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_evis.c index 0c8273e..2ef5977 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/gather_evis.c @@ -53,18 +53,34 @@ __BEGIN_DECLS #define VX_KERNEL_NAME_GATHER_U8TOF16 CVIVANTE_NAMESPACE("evis.gather_U8toF16") #define VX_KERNEL_NAME_GATHER_F16TOU8 CVIVANTE_NAMESPACE("evis.gather_F16toU8") +#define VX_KERNEL_NAME_GATHER_AXIS0_U8TOU8 CVIVANTE_NAMESPACE("evis.gather_U8toU8_axis0") +#define VX_KERNEL_NAME_GATHER_AXIS0_I8TOI8 CVIVANTE_NAMESPACE("evis.gather_I8toI8_axis0") +#define VX_KERNEL_NAME_GATHER_AXIS0_I16TOI16 CVIVANTE_NAMESPACE("evis.gather_I16toI16_axis0") +#define VX_KERNEL_NAME_GATHER_AXIS0_F16TOF16 CVIVANTE_NAMESPACE("evis.gather_F16toF16_axis0") +#define VX_KERNEL_NAME_GATHER_AXIS0_I8TOF16 CVIVANTE_NAMESPACE("evis.gather_I8toF16_axis0") +#define 
VX_KERNEL_NAME_GATHER_AXIS0_I16TOF16 CVIVANTE_NAMESPACE("evis.gather_I16toF16_axis0") +#define VX_KERNEL_NAME_GATHER_AXIS0_F16TOI8 CVIVANTE_NAMESPACE("evis.gather_F16toI8_axis0") +#define VX_KERNEL_NAME_GATHER_AXIS0_F16TOI16 CVIVANTE_NAMESPACE("evis.gather_F16toI16_axis0") +#define VX_KERNEL_NAME_GATHER_AXIS0_U8TOF16 CVIVANTE_NAMESPACE("evis.gather_U8toF16_axis0") +#define VX_KERNEL_NAME_GATHER_AXIS0_F16TOU8 CVIVANTE_NAMESPACE("evis.gather_F16toU8_axis0") + #define KERNEL_SOURCE_1 "gather" #define KERNEL_SOURCE_2 "gather_mix" // Add kernel hashtable here -#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _quant_type) \ - ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_quant_type)) +#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _is_axis0) \ + ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_is_axis0)) #define TENSOR_GATHER_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \ { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0), \ VX_KERNEL_NAME_GATHER_##IN0_TYPE##TO##OUT_TYPE, \ SOURCE }, +#define TENSOR_GATHER_AXIS0_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \ + { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1), \ + VX_KERNEL_NAME_GATHER_AXIS0_##IN0_TYPE##TO##OUT_TYPE, \ + SOURCE }, + static const struct { uint32_t key; char* function_name; @@ -81,6 +97,16 @@ static const struct { TENSOR_GATHER_KERNELS(F16, I32, I16, KERNEL_SOURCE_2) TENSOR_GATHER_KERNELS(U8, I32, F16, KERNEL_SOURCE_2) TENSOR_GATHER_KERNELS(F16, I32, U8, KERNEL_SOURCE_2) + TENSOR_GATHER_AXIS0_KERNELS(U8, I32, U8, KERNEL_SOURCE_1) + TENSOR_GATHER_AXIS0_KERNELS(I8, I32, I8, KERNEL_SOURCE_1) + TENSOR_GATHER_AXIS0_KERNELS(I16, I32, I16, KERNEL_SOURCE_1) + TENSOR_GATHER_AXIS0_KERNELS(F16, I32, F16, KERNEL_SOURCE_1) + TENSOR_GATHER_AXIS0_KERNELS(I8, I32, F16, KERNEL_SOURCE_2) + TENSOR_GATHER_AXIS0_KERNELS(I16, I32, F16, KERNEL_SOURCE_2) + TENSOR_GATHER_AXIS0_KERNELS(F16, I32, I8, KERNEL_SOURCE_2) + TENSOR_GATHER_AXIS0_KERNELS(F16, I32, I16, KERNEL_SOURCE_2) + TENSOR_GATHER_AXIS0_KERNELS(U8, I32, F16, KERNEL_SOURCE_2) + TENSOR_GATHER_AXIS0_KERNELS(F16, I32, U8, KERNEL_SOURCE_2) }; /* @@ -123,7 +149,7 @@ static vsi_status get_gather_tensor_reshape_size sizes[i] = 1; } - if(idxFlg && elementCnt < VSI_NN_MAX_IMAGE_WIDTH) + if (idxFlg && elementCnt < VSI_NN_MAX_IMAGE_WIDTH) { sizes[0] = elementCnt; sizes[1] = 1; @@ -131,7 +157,7 @@ static vsi_status get_gather_tensor_reshape_size } else { - if((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH) + if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH) { sizes[0] = block_size; sizes[1] = elementCnt / block_size; @@ -191,7 +217,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) src0Scale = attr[0]->asymm.scale; dstZP = attr[2]->asymm.zero_point; dstScale = attr[2]->asymm.scale; - if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) { if (attr[0]->dfp.fl > 0) { @@ -202,12 +228,12 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); } } - else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) + else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) { src0Scale = 1; } - if( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) + if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) { if (attr[2]->dfp.fl > 0) { @@ -219,7 +245,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) } dstScale = 1.0f/dstScale; } - else if( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE ) + else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE ) { dstScale = 1; } @@ 
-232,7 +258,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) } shaderParam.global_scale[0] = 16; - if(attr[0]->dtype == I16 || attr[0]->dtype == F16) + if (attr[0]->dtype == I16 || attr[0]->dtype == F16) { shaderParam.global_scale[0] = 8; } @@ -340,6 +366,214 @@ OnError: return status; } +DEF_KERNEL_INITIALIZER(_gather_axis0_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + int32_t block_num = 0; + int32_t indices_num = 1; + uint32_t input_dims1 = 0; + vx_uint32 i = 0; + vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL}; + vsi_int_array_t * input1_shape = NULL; + int32_t src0ZP = 0; + float src0Scale = 0; + int32_t dstZP = 0; + float dstScale = 0; + + uint32_t pack_key = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &block_num); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + src0ZP = attr[0]->asymm.zero_point; + src0Scale = attr[0]->asymm.scale; + dstZP = attr[2]->asymm.zero_point; + dstScale = attr[2]->asymm.scale; + if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[0]->dfp.fl > 0) + { + src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); + } + else + { + src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); + } + } + else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) + { + src0Scale = 1; + } + + if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[2]->dfp.fl > 0) + { + dstScale = (float)((int64_t)1 << attr[2]->dfp.fl); + } + else + { + dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); + } + dstScale = 1.0f/dstScale; + } + else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE ) + { + dstScale = 1; + } + + input1_shape = attr[1]->shape; + input_dims1 = (uint32_t)input1_shape->size; + for (i = 0; i < input_dims1; i++) + { + indices_num *= input1_shape->data[i]; + } + + shaderParam.global_scale[0] = 4; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((indices_num + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = block_num; + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + +#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE) \ + (IN0_TYPE | (OUT_TYPE << 8)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype); + + { + uint16_t M0 = 0; + int32_t postShift = 0; + uint32_t multAndoutZP0[2] = {0}; + uint32_t multAndoutZP1[2] = {0}; + gpu_dp_inst_t uniU8MulAndPostShift_0_Lo_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 
0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertFp16toU8_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniExtraCopyDpKeepinEvis_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + switch( pack_key ) + { + case _PACK_SELECT_KEY( U8, F16): + case _PACK_SELECT_KEY( I8, F16): + case _PACK_SELECT_KEY( I16, F16): + { + gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift); + multAndoutZP0[0] = (uint32_t)(M0); + multAndoutZP0[1] = (uint32_t)((dstZP << postShift) - src0ZP * M0); + + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift ); + status = vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( F16, U8): + case _PACK_SELECT_KEY( F16, I8): + case _PACK_SELECT_KEY( F16, I16): + { + int32_t postShift0 = 0; + gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift0); + + multAndoutZP1[0] = (uint32_t)(M0); + multAndoutZP1[1] = (uint32_t)((dstZP << postShift0) - src0ZP * M0); + + gpu_dp_inst_update_postshfit( &uniConvertFp16toU8_2x8, postShift0 ); + status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertFp16toU8_2x8", &uniConvertFp16toU8_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( I16, I16): + case _PACK_SELECT_KEY( I8, I8): + case _PACK_SELECT_KEY( U8, U8): + case _PACK_SELECT_KEY( F16, F16): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniExtraCopyDpKeepinEvis_2x8", &uniExtraCopyDpKeepinEvis_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + break; + } + } +#undef _PACK_SELECT_KEY + + status = vsi_nn_kernel_gpu_add_param(node, "indices_num", &indices_num); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } + + return status; +} + /* * Query kernel */ @@ -348,7 +582,8 @@ static vsi_status _query_kernel vsi_nn_tensor_t* const* const inputs, vsi_nn_tensor_t* const* const outputs, vsi_nn_kernel_t* kernel, - const vsi_nn_kernel_param_t * params + const vsi_nn_kernel_param_t * params, + int32_t axis ) { vsi_status status = VSI_FAILURE; @@ -360,21 +595,28 @@ static vsi_status _query_kernel input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( 
outputs[0]->attr.dtype.vx_type ); - key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, 0 ); + key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, axis ); for( i = 0; i < _cnt_of_array(gather_map); i ++ ) { - if( gather_map[i].key == key ) + if ( gather_map[i].key == key ) { break; } } - if( i < _cnt_of_array(gather_map) ) + if ( i < _cnt_of_array(gather_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", gather_map[i].function_name ); kernel->info.parameters = _gather_kernel_param_def; kernel->info.numParams = _cnt_of_array( _gather_kernel_param_def ); - kernel->info.initialize = _gather_initializer; + if (axis) + { + kernel->info.initialize = _gather_axis0_initializer; + } + else + { + kernel->info.initialize = _gather_initializer; + } vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, "vsi_nn_kernel_header", @@ -405,26 +647,39 @@ static vsi_nn_kernel_node_t _setup int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); int32_t block_num = vsi_nn_kernel_param_get_int32( params, "block_num" ); int32_t axis_num = vsi_nn_kernel_param_get_int32( params, "axis_num" ); + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + int32_t axis0_flg = 0; - status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, 0); - status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1); - status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0); - if(status != VSI_SUCCESS) + if (axis == 0) + { + status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], axis_num, 0); + status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1); + status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], shapes[1][0], 0); + axis0_flg = 1; + } + else + { + status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, 0); + status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1); + status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0); + axis0_flg = 0; + } + if (status != VSI_SUCCESS) { return NULL; } - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; } - status = _query_kernel( inputs, outputs, kernel, params ); - if( VSI_SUCCESS == status) + status = _query_kernel( inputs, outputs, kernel, params, axis0_flg); + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { uint32_t index = 0; #define RESHAPE_DIM 2 diff --git a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c index b1f413c..b893e74 100644 --- a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c @@ -183,7 +183,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; vsi_int_array_t * input_shape = NULL; - float scaleIn = 0; + float scaleIn = 1; int32_t input_zp = 0; vx_uint32 iter = 0; int32_t sumInZp = 0; @@ -206,10 +206,13 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); input_shape = attr[0]->shape; - input_zp = attr[0]->asymm.zero_point; - scaleIn = attr[0]->asymm.scale; - if(attr[0]->dtype == I8 || attr[0]->dtype == I16) + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + input_zp = 
attr[0]->asymm.zero_point; + scaleIn = attr[0]->asymm.scale; + } + else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) { if (attr[0]->dfp.fl > 0) { @@ -225,13 +228,13 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) width = input_shape->data[0]; height = input_shape->data[1]; chn = attr[1]->shape->data[1]; - if(rsFlg) + if (rsFlg) { height = height / chn; } iter = height * 16; - if(attr[0]->dtype == U8) + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { sumInZp = input_zp * iter * (-1); tmpZp1 = (-2) * input_zp; @@ -247,11 +250,11 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) shaderParam.local_size[1] = 1; shaderParam.local_size[2] = 1; - if(attr[0]->dtype == I8 || attr[0]->dtype == U8) + if (attr[0]->dtype == I8 || attr[0]->dtype == U8) { shaderParam.global_size[0] = (width + 255) / 256 * 16; } - else if(attr[0]->dtype == I16 || attr[0]->dtype == F16) + else if (attr[0]->dtype == I16 || attr[0]->dtype == F16) { shaderParam.global_size[0] = (width + 127) / 128 * 16; } @@ -261,7 +264,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) status = vsi_nn_kernel_gpu_config( node, &shaderParam ); CHECK_STATUS_FAIL_GOTO(status, OnError); - if(attr[0]->dtype == U8) + if (attr[0]->dtype == U8) { gpu_dp_inst_t uniSumU8_16x1 = {{ 0x55555555, // TCfg @@ -290,7 +293,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "rowSumScale", &rowSumScale); CHECK_STATUS_FAIL_GOTO(status, OnError ); } - else if(attr[0]->dtype == I8) + else if (attr[0]->dtype == I8) { gpu_dp_inst_t uniSumInt8_16x1 = {{ 0x55555555, // TCfg @@ -317,7 +320,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "inFlScale_s2", &inFlScale_s2); CHECK_STATUS_FAIL_GOTO(status, OnError ); } - else if(attr[0]->dtype == I16) + else if (attr[0]->dtype == I16) { gpu_dp_inst_t uniInt16SumSqr_dp8x2 = {{ 0x55555555, // TCfg @@ -333,7 +336,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "inFlScale_s2", &inFlScale_s2); CHECK_STATUS_FAIL_GOTO(status, OnError ); } - else if(attr[0]->dtype == F16) + else if (attr[0]->dtype == F16) { gpu_dp_inst_t uniFp16SumSqr_dp8x2 = {{ 0x55555555, // TCfg @@ -384,10 +387,10 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL}; vsi_int_array_t * input_shape = NULL; - float scaleIn = 0; - float scaleOut = 0; - float reScaleOut_u8 = 0; - float scale_inOut = 0; + float scaleIn = 1.0f; + float scaleOut = 1.0f; + float reScaleOut_u8 = 1.0f; + float scale_inOut = 1.0f; int32_t output_zp = 0; int32_t input_zp = 0; float in_scale_fl = 1, out_scale_fl = 1, inOut_fl_scale = 1; @@ -407,12 +410,13 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); input_shape = attr[0]->shape; - input_zp = attr[0]->asymm.zero_point; - scaleIn = attr[0]->asymm.scale; - output_zp = attr[2]->asymm.zero_point; - scaleOut = attr[2]->asymm.scale; - if(attr[0]->dtype == I8 || attr[0]->dtype == I16) + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + input_zp = attr[0]->asymm.zero_point; + scaleIn = attr[0]->asymm.scale; + } + else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) { if (attr[0]->dfp.fl > 0) { @@ -422,9 +426,16 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) { in_scale_fl = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); } + input_zp = 0; } - if(attr[2]->dtype == I8 || attr[2]->dtype == I16) + if (attr[2]->quant == 
VSI_NN_KERNEL_QUANT_ASYMM) + { + output_zp = attr[2]->asymm.zero_point; + scaleOut = attr[2]->asymm.scale; + reScaleOut_u8 = 1 / scaleOut; + } + else if (attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP) { if (attr[2]->dfp.fl > 0) { @@ -434,10 +445,11 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) { out_scale_fl = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); } + output_zp = 0; } - if((attr[2]->dtype == I8 || attr[2]->dtype == I16) - && (attr[0]->dtype == I8 || attr[0]->dtype == I16)) + if ((attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) + && (attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP)) { inOut_fl_scale = in_scale_fl * out_scale_fl; } @@ -445,21 +457,17 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) width = input_shape->data[0]; height = input_shape->data[1]; chn = attr[1]->shape->data[1]; - if(rsFlg) + if (rsFlg) { height = height / chn; } - if(attr[2]->dtype == U8) - { - reScaleOut_u8 = 1 / scaleOut; - } dimRatio = (float)(1.0 / (width * height)); group_num = (width + 255) / 256; shaderParam.global_scale[0] = 16; - if(attr[0]->dtype == I16 || attr[0]->dtype == F16) + if (attr[0]->dtype == I16 || attr[0]->dtype == F16) { shaderParam.global_scale[0] = 8; group_num = (width + 127) / 128; @@ -774,12 +782,12 @@ static vsi_status _query_kernel for( i = 0; i < kernel_map_size; i ++ ) { - if( kernel_map[i].key == hashkey ) + if ( kernel_map[i].key == hashkey ) { break; } } - if( i < kernel_map_size ) + if ( i < kernel_map_size ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); kernel->info.parameters = param_def; @@ -830,7 +838,7 @@ static vsi_nn_kernel_node_t _setup int32_t reshape_flg = vsi_nn_kernel_param_get_int32( params, "reshape_flg" ); // Check if gpu can support the size - if( !vsi_nn_kernel_gpu_check_shape( + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; @@ -850,7 +858,7 @@ static vsi_nn_kernel_node_t _setup attr.size[0] = ((inputs[0]->attr.size[0] + 255) / 256) * 4; - if( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16 + if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16 || inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16) { attr.size[0] = ((inputs[0]->attr.size[0] + 127) / 128) * 4; @@ -868,17 +876,17 @@ static vsi_nn_kernel_node_t _setup hashkey = HASH_INSTANCENORM_KEY( in0_dtype, out_dtype, reshape_flg ); status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_MEAN_VARI ); - if( VSI_SUCCESS != status ) + if ( VSI_SUCCESS != status ) { goto final; } status = _query_kernel( kernel, hashkey, INTERNAL_KERNEL_NORM ); - if( VSI_SUCCESS != status ) + if ( VSI_SUCCESS != status ) { goto final; } - if(reshape_flg) + if (reshape_flg) { int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; shape[0] = inputs[0]->attr.size[0]; @@ -893,7 +901,7 @@ static vsi_nn_kernel_node_t _setup shape[3] = outputs[0]->attr.dim_num > 3 ? 
outputs[0]->attr.size[3] : 1; rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 ); } - if(inputs[1]->attr.dim_num < 2) + if (inputs[1]->attr.dim_num < 2) { int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; shape[0] = inputs[1]->attr.size[0]; @@ -902,7 +910,7 @@ static vsi_nn_kernel_node_t _setup shape[3] = 1; rs_beta = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shape, 4 ); } - if(inputs[2]->attr.dim_num < 2) + if (inputs[2]->attr.dim_num < 2) { int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; shape[0] = inputs[2]->attr.size[0]; @@ -914,10 +922,10 @@ static vsi_nn_kernel_node_t _setup // Mean Vari { tmp_node = vsi_nn_kernel_create_node( graph, ikernels[MEAN_VARI_INDEX] ); - if(tmp_node) + if (tmp_node) { uint32_t index = 0; - if(reshape_flg) + if (reshape_flg) { mean_vari_node_params[index++] = rs_input; vsi_nn_kernel_node_pack_io( &mean_vari_node_params[index], @@ -943,7 +951,7 @@ static vsi_nn_kernel_node_t _setup border.mode = VX_BORDER_CONSTANT; border.constant_value.U8 = 0; border.constant_value.U16 = 0; - if(inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) + if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; } @@ -956,10 +964,10 @@ static vsi_nn_kernel_node_t _setup // Nomalization { node = vsi_nn_kernel_create_node( graph, kernel ); - if(node) + if (node) { uint32_t index = 0; - if(reshape_flg) + if (reshape_flg) { node_params[index++] = rs_input; } @@ -967,7 +975,7 @@ static vsi_nn_kernel_node_t _setup { node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t; } - if(inputs[1]->attr.dim_num < 2) + if (inputs[1]->attr.dim_num < 2) { node_params[index++] = rs_beta; } @@ -975,7 +983,7 @@ static vsi_nn_kernel_node_t _setup { node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t; } - if(inputs[2]->attr.dim_num < 2) + if (inputs[2]->attr.dim_num < 2) { node_params[index++] = rs_gamma; } @@ -984,7 +992,7 @@ static vsi_nn_kernel_node_t _setup node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t; } node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t; - if(reshape_flg) + if (reshape_flg) { node_params[index++] = rs_output; } @@ -1006,9 +1014,9 @@ static vsi_nn_kernel_node_t _setup border.mode = VX_BORDER_CONSTANT; border.constant_value.U8 = 0; border.constant_value.U16 = 0; - if(outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) + if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { - border.constant_value.U8 = (vx_uint8)outputs[0]->attr.dtype.zero_point; + border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; } status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); @@ -1018,31 +1026,31 @@ static vsi_nn_kernel_node_t _setup /* Pass parameters to node. 
*/ final: - if(rs_beta) + if (rs_beta) { vsi_nn_kernel_tensor_release( &rs_beta ); } - if(rs_gamma) + if (rs_gamma) { vsi_nn_kernel_tensor_release( &rs_gamma ); } - if(reshape_flg) + if (reshape_flg) { vsi_nn_kernel_tensor_release( &rs_input ); vsi_nn_kernel_tensor_release( &rs_output ); } for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) { - if( ikernels[i] ) + if ( ikernels[i] ) { vsi_nn_kernel_release( &ikernels[i] ); } - if( tensors[i] ) + if ( tensors[i] ) { vsi_nn_ReleaseTensor( &tensors[i] ); } } - if(tmp_node) {vsi_nn_kernel_node_release( &tmp_node );} + if (tmp_node) {vsi_nn_kernel_node_release( &tmp_node );} return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c new file mode 100644 index 0000000..238eb23 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c @@ -0,0 +1,1389 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ + typedef enum +{ + LAYERNORM_KERNEL, + LAYERNORM_2D_KERNEL, + SUMSQR_KERNEL, + SUMSQR_2D_KERNEL, + LAYERNORM_WH_KERNEL, + LAYERNORM_WH_2D_KERNEL, +} _kernel_type_e; + +#define KERNEL_SOURCE_1 "layer_normalization" +#define KERNEL_SOURCE_2 "layer_normalization_2d" +#define KERNEL_SOURCE_3 "layer_normalization_u8_f16" +#define KERNEL_SOURCE_4 "layer_normalization_wh_u8" +#define KERNEL_SOURCE_5 "layer_normalization_wh_f16" +#define KERNEL_SOURCE_6 "layer_normalization_i16" +#define KERNEL_SOURCE_7 "layer_normalization_wh_i16" + + +#define HASH_LAYERNORM_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.layer_norm_"#SRC0_TYPE"to"#DST_TYPE) + +#define HASH_LAYERNORM_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.layer_norm_"#SRC0_TYPE"to"#DST_TYPE"_2D") + +// normalization +#define HASH_LAYERNORM_KEY(_input0_type, _output_type, _reshape_flag) \ + ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8)) + +#define TENSOR_LAYERNORM_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, LAYERNORM_KERNEL), \ + HASH_LAYERNORM_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_LAYERNORM_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, LAYERNORM_2D_KERNEL), \ + HASH_LAYERNORM_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +// greater than max size +#define HASH_SUMSQR_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.layernorm_wh_sumSqr_"#SRC0_TYPE"to"#DST_TYPE) + +#define HASH_SUMSQR_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.layernorm_wh_sumSqr_"#SRC0_TYPE"to"#DST_TYPE"_2D") + +#define HASH_LAYERNORM_WH_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.layernorm_wh_"#SRC0_TYPE"to"#DST_TYPE) + +#define HASH_LAYERNORM_WH_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.layernorm_wh_"#SRC0_TYPE"to"#DST_TYPE"_2D") + +#define TENSOR_SUMSQR_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, SUMSQR_KERNEL), \ + HASH_SUMSQR_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_SUMSQR_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, SUMSQR_2D_KERNEL), \ + HASH_SUMSQR_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_LAYERNORM_WH_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, LAYERNORM_WH_KERNEL), \ + HASH_LAYERNORM_WH_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_LAYERNORM_WH_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, LAYERNORM_WH_2D_KERNEL), \ + HASH_LAYERNORM_WH_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _layernorm_kernel_map[] = +{ + // Register kernel here + TENSOR_LAYERNORM_KERNELS( U8, U8, KERNEL_SOURCE_1 ) + TENSOR_LAYERNORM_KERNELS_2D( U8, U8, KERNEL_SOURCE_2 ) + TENSOR_LAYERNORM_KERNELS( U8, F16, KERNEL_SOURCE_3 ) + TENSOR_LAYERNORM_KERNELS_2D( U8, F16, KERNEL_SOURCE_3 ) + + TENSOR_LAYERNORM_KERNELS( F16, F16, KERNEL_SOURCE_1 ) + TENSOR_LAYERNORM_KERNELS_2D( F16, F16, KERNEL_SOURCE_2 ) + TENSOR_LAYERNORM_KERNELS( F16, U8, KERNEL_SOURCE_1 ) + TENSOR_LAYERNORM_KERNELS_2D( F16, U8, KERNEL_SOURCE_2 ) + TENSOR_LAYERNORM_KERNELS( I16, I16, KERNEL_SOURCE_6 ) + TENSOR_LAYERNORM_KERNELS_2D( I16, I16, KERNEL_SOURCE_6 ) +}; + +static const _kernel_map_type 
_sumsqr_kernel_map[] = +{ + // Register kernel here + TENSOR_SUMSQR_KERNELS( U8, F32, KERNEL_SOURCE_4 ) + TENSOR_SUMSQR_KERNELS_2D( U8, F32, KERNEL_SOURCE_4 ) + TENSOR_SUMSQR_KERNELS( F16, F32, KERNEL_SOURCE_5 ) + TENSOR_SUMSQR_KERNELS_2D( F16, F32, KERNEL_SOURCE_5 ) + TENSOR_SUMSQR_KERNELS( I16, F32, KERNEL_SOURCE_7 ) + TENSOR_SUMSQR_KERNELS_2D( I16, F32, KERNEL_SOURCE_7 ) + + TENSOR_LAYERNORM_WH_KERNELS( U8, U8, KERNEL_SOURCE_4 ) + TENSOR_LAYERNORM_WH_KERNELS_2D( U8, U8, KERNEL_SOURCE_4 ) + TENSOR_LAYERNORM_WH_KERNELS( U8, F16, KERNEL_SOURCE_4 ) + TENSOR_LAYERNORM_WH_KERNELS_2D( U8, F16, KERNEL_SOURCE_4 ) + TENSOR_LAYERNORM_WH_KERNELS( F16, F16, KERNEL_SOURCE_5 ) + TENSOR_LAYERNORM_WH_KERNELS_2D( F16, F16, KERNEL_SOURCE_5 ) + TENSOR_LAYERNORM_WH_KERNELS( I16, I16, KERNEL_SOURCE_7 ) + TENSOR_LAYERNORM_WH_KERNELS_2D( I16, I16, KERNEL_SOURCE_7 ) +}; + +/* + * Kernel params + */ + +static vx_param_description_t _layernorm_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; + +static vx_param_description_t _sumSqr_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; + +static vx_param_description_t _layernorm_wh_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; + +#define _LAYERNORM_PARAM_NUM _cnt_of_array( _layernorm_kernel_param_def ) +#define _SUMSQR_PARAM_NUM _cnt_of_array( _sumSqr_kernel_param_def ) +#define _LAYERNORM_WH_PARAM_NUM _cnt_of_array( _layernorm_wh_kernel_param_def ) + +/* + * Kernel initializer + */ + +DEF_KERNEL_INITIALIZER(_layernorm_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL}; + vsi_int_array_t * input_shape = NULL; + float scaleIn = 1; + float scaleOut = 1; + float output_zp = 0; + int32_t input_zp = 0; + int32_t iter = 0; + int32_t sumInZp = 0; + int32_t tmpZp1 = 0; + int32_t tmpZp2 = 0; + float e2InScale = 0; + int32_t height = 0, width = 0, chn = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr 
buffer fail.", OnError ); + + input_shape = attr[0]->shape; + + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + input_zp = attr[0]->asymm.zero_point; + scaleIn = attr[0]->asymm.scale; + } + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) + { + if (attr[0]->dfp.fl > 0) + { + scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); + } + else + { + scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); + } + input_zp = 0; + } + else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE) + { + scaleIn = 1; + input_zp = 0; + } + + if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + output_zp = (float)attr[2]->asymm.zero_point; + scaleOut = 1.0f / attr[2]->asymm.scale; + } + if (attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP) + { + if (attr[2]->dfp.fl > 0) + { + scaleOut = (float)((int64_t)1 << attr[2]->dfp.fl); + } + else + { + scaleOut = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); + } + output_zp = 0; + } + else if (attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE) + { + scaleOut = 1; + output_zp = 0.0f; + } + + width = input_shape->data[0]; + height = input_shape->data[1]; + chn = (input_shape->size <= 2) ? 1 : input_shape->data[2]; + + iter = ((width + 15) / 16) * 16; + sumInZp = input_zp * iter * (-1); + tmpZp1 = (-2) * input_zp; + tmpZp2 = iter * input_zp * input_zp; + e2InScale = scaleIn * scaleIn; + + shaderParam.global_scale[0] = width; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = 1; + shaderParam.global_size[1] = gpu_align_p2((height + shaderParam.global_scale[1] - 1) + / shaderParam.global_scale[1], 4); + shaderParam.global_size[2] = chn; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + float dimRatio = 1.0f / (float)width; + float dimRatio_scale = dimRatio * scaleIn; + gpu_dp_inst_t uniFp16SumSqr_dp8x2 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0x76543210, // ABin + 0x5555aaaa, // BSelt + 0x00000000, 0x76543210, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t UniFP16toFP32Lo4_dp4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractHalf4_dp4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00020000, 0x00060004, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertSecFp16Fp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniSumU8_16x1 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0xfedcba98, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 
0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniSqrSum_16x1 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0xfedcba98, // ABin + 0x55555555, // BSelt + 0x76543210, 0xfedcba98, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert2ndUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00050004, 0x00070006, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert3rdUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00090008, 0x000b000a, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert4thUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x000d000c, 0x000f000e, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t UniPackFP16even_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniInt16SumSqr_dp8x2 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0x76543210, // ABin + 0x5555aaaa, // BSelt + 0x00000000, 0x76543210, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + uint32_t pack_key = 0; +#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \ + (IN0_TYPE | (IN1_TYPE << 16) | (OUT_TYPE << 8)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype, attr[2]->dtype ); + + status = vsi_nn_kernel_gpu_add_param(node, "width", &width); + status |= vsi_nn_kernel_gpu_add_param(node, "dimRatio", &dimRatio); + status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4); + status |= 
vsi_nn_kernel_gpu_add_param(node, "uniConvertSecFp16Fp32_4x4", &uniConvertSecFp16Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + switch( pack_key ) + { + case _PACK_SELECT_KEY( U8, F16, F16 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "UniPackFP16even_2x8", + &UniPackFP16even_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", + &uniConvert1stUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", + &uniConvert2ndUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4", + &uniConvert3rdUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", + &uniConvert4thUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp); + status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1); + status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp2", &tmpZp2); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( U8, F16, U8 ): + case _PACK_SELECT_KEY( F16, F16, F16 ): + case _PACK_SELECT_KEY( F16, F16, U8 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", + &uniFp16SumSqr_dp8x2); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractHalf4_dp4x4", + &uniExtractHalf4_dp4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", + &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", + &uniConvert1stUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", + &uniConvert2ndUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4", + &uniConvert3rdUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", + &uniConvert4thUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp); + status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1); + status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp2", &tmpZp2); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( I16, F16, I16 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniInt16SumSqr_dp8x2", + &uniInt16SumSqr_dp8x2); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", + &uniConvert1stUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", + &uniConvert2ndUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", + &uniConvertInt32toUint8_2x8); + + status |= 
vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut); + status |= vsi_nn_kernel_gpu_add_param(node, "dimRatio_scale", &dimRatio_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + VSI_ASSERT( FALSE ); + return VSI_FAILURE; + } +#undef _PACK_SELECT_KEY + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } + + return status; +} + +DEF_KERNEL_INITIALIZER(_sumsqr_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; + vsi_int_array_t * input_shape = NULL; + float scaleIn = 1.0f; + int32_t input_zp = 0; + vx_uint32 iter = 0; + int32_t sumInZp = 0; + int32_t tmpZp1 = 0; + float tmpZp2 = 0; + float e2InScale = 0; + float rowSumScale = 0; + int32_t width = 0; + int32_t height = 0; + int32_t chn = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + + input_shape = attr[0]->shape; + + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + input_zp = attr[0]->asymm.zero_point; + scaleIn = attr[0]->asymm.scale; + } + else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) + { + if (attr[0]->dfp.fl > 0) + { + scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); + } + else + { + scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); + } + input_zp = 0; + } + + width = input_shape->data[0]; + height = input_shape->data[1]; + chn = attr[1]->shape->data[1]; + iter = height * 16; + + e2InScale = scaleIn * scaleIn; + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + sumInZp = input_zp * iter * (-1); + tmpZp1 = (-2) * input_zp; + tmpZp2 = input_zp * input_zp * e2InScale; + rowSumScale = height * 16 * tmpZp2; + } + + shaderParam.global_scale[0] = 1; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.local_size[0] = 16; + shaderParam.local_size[1] = 1; + shaderParam.local_size[2] = 1; + + if (attr[0]->dtype == I8 || attr[0]->dtype == U8) + { + shaderParam.global_size[0] = (width + 255) / 256 * 16; + } + else if (attr[0]->dtype == I16 || attr[0]->dtype == F16) + { + shaderParam.global_size[0] = (width + 127) / 128 * 16; + } + shaderParam.global_size[1] = chn; + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + if (attr[0]->dtype == U8) + { + gpu_dp_inst_t uniSumU8_16x1 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0xfedcba98, // ABin + 0xaaaaaaaa, 
// BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniSqrSum_16x1 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0xfedcba98, // ABin + 0x55555555, // BSelt + 0x76543210, 0xfedcba98, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + status = vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp); + status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); + status |= vsi_nn_kernel_gpu_add_param(node, "rowSumScale", &rowSumScale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + else if (attr[0]->dtype == F16) + { + gpu_dp_inst_t uniFp16SumSqr_dp8x2 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0x76543210, // ABin + 0x5555aaaa, // BSelt + 0x00000000, 0x76543210, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + else if (attr[0]->dtype == I16) + { + gpu_dp_inst_t uniInt16SumSqr_dp8x2 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0x76543210, // ABin + 0x5555aaaa, // BSelt + 0x00000000, 0x76543210, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + status = vsi_nn_kernel_gpu_add_param(node, "uniInt16SumSqr_dp8x2", &uniInt16SumSqr_dp8x2); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + + status = vsi_nn_kernel_gpu_add_param(node, "width", &width); + status |= vsi_nn_kernel_gpu_add_param(node, "height", &height); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + + return status; +} + +DEF_KERNEL_INITIALIZER(_layernorm_wh_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL}; + vsi_int_array_t * input_shape = NULL; + float scaleIn = 1.0f; + float scaleOut = 1.0f; + float output_zp = 0; + int32_t input_zp = 0; + float dimRatio = 0; + vx_uint32 group_num = 0; + vx_int32 
height = 0, width = 0, chn = 0, height_chn_org = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[4] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); + + input_shape = attr[0]->shape; + + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + input_zp = attr[0]->asymm.zero_point; + scaleIn = attr[0]->asymm.scale; + } + else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) + { + if (attr[0]->dfp.fl > 0) + { + scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); + } + else + { + scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); + } + input_zp = 0; + } + + if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + output_zp = (float)attr[2]->asymm.zero_point; + scaleOut = 1.0f / attr[2]->asymm.scale; + } + else if (attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP) + { + if (attr[2]->dfp.fl > 0) + { + scaleOut = (float)((int64_t)1 << attr[2]->dfp.fl); + } + else + { + scaleOut = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); + } + output_zp = 0; + } + + width = input_shape->data[0]; + height = input_shape->data[1]; + chn = attr[1]->shape->data[1]; + height_chn_org = (input_shape->size > 2 ? input_shape->data[2] : 1) / chn; + + dimRatio = (float)(1.0 / (width * height)); + + group_num = (width + 255) / 256; + if (attr[0]->dtype == I16 || attr[0]->dtype == F16) + { + group_num = (width + 127) / 128; + } + + shaderParam.global_scale[0] = 8; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = (chn + shaderParam.global_scale[1] - 1) + / shaderParam.global_scale[1]; + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t UniFP16toFP32Lo4_dp4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertSecFp16Fp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 
0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert2ndUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00050004, 0x00070006, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertHalfToFp16_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + + uint32_t pack_key = 0; +#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE ) \ + (IN0_TYPE | (OUT_TYPE << 8)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype ); + + status = vsi_nn_kernel_gpu_add_param(node, "height", &height); + status |= vsi_nn_kernel_gpu_add_param(node, "height_depth", &height_chn_org); + status |= vsi_nn_kernel_gpu_add_param(node, "dimRatio", &dimRatio); + status |= vsi_nn_kernel_gpu_add_param(node, "group_num", &group_num); + status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecFp16Fp32_4x4", &uniConvertSecFp16Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + switch( pack_key ) + { + case _PACK_SELECT_KEY( U8, U8 ): + case _PACK_SELECT_KEY( U8, F16 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", + &uniConvert1stUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", + &uniConvert2ndUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", + &uniConvertHalfToFp16_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( F16, F16 ): + case _PACK_SELECT_KEY( F16, U8 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", + &uniConvertHalfToFp16_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( I16, I16 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", + &uniConvert1stUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", + &uniConvert2ndUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + VSI_ASSERT( FALSE ); + return VSI_FAILURE; + } +#undef _PACK_SELECT_KEY + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); 
+ attr[2] = NULL; + } + + return status; +} + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel, + int32_t reshape2D + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + uint32_t key = 0; + int i = 0; + _kernel_type_e kernel_type = LAYERNORM_KERNEL; + + if (reshape2D) + { + kernel_type = LAYERNORM_2D_KERNEL; + } + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_LAYERNORM_KEY( input0_dtype, output_dtype, kernel_type ); + + for( i = 0; i < _cnt_of_array(_layernorm_kernel_map); i ++ ) + { + if ( _layernorm_kernel_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(_layernorm_kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _layernorm_kernel_map[i].function_name ); + kernel->info.parameters = _layernorm_kernel_param_def; + kernel->info.numParams = _LAYERNORM_PARAM_NUM; + kernel->info.initialize = _layernorm_initializer; + + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + _layernorm_kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + _layernorm_kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_status _query_kernel_wh + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel_sumSqr, + vsi_nn_kernel_t* kernel, + _kernel_type_e is2D_sumsqr, + _kernel_type_e is2D_wh + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_LAYERNORM_KEY( input0_dtype, F32, is2D_sumsqr ); + + for( i = 0; i < _cnt_of_array(_sumsqr_kernel_map); i ++ ) + { + if ( _sumsqr_kernel_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(_sumsqr_kernel_map) ) + { + snprintf( kernel_sumSqr->info.name, VX_MAX_KERNEL_NAME, "%s", _sumsqr_kernel_map[i].function_name ); + kernel_sumSqr->info.parameters = _sumSqr_kernel_param_def; + kernel_sumSqr->info.numParams = _SUMSQR_PARAM_NUM; + kernel_sumSqr->info.initialize = _sumsqr_initializer; + + vsi_nn_kernel_add_source( kernel_sumSqr, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + _sumsqr_kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel_sumSqr, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + _sumsqr_kernel_map[i].source_name ); + } + + + key = HASH_LAYERNORM_KEY( input0_dtype, output_dtype, is2D_wh ); + + for( i = 0; i < _cnt_of_array(_sumsqr_kernel_map); i ++ ) + { + if ( _sumsqr_kernel_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(_sumsqr_kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _sumsqr_kernel_map[i].function_name ); + kernel->info.parameters = _layernorm_wh_kernel_param_def; + kernel->info.numParams = _LAYERNORM_WH_PARAM_NUM; + kernel->info.initialize = _layernorm_wh_initializer; + + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + _sumsqr_kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, 
VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + _sumsqr_kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel_wh() */ + +static vsi_nn_kernel_node_t _setup_wh + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL, rs_gamma = NULL, rs_beta = NULL; + vsi_nn_kernel_node_param_t sumSqr_node_params[_SUMSQR_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t node_params[_LAYERNORM_WH_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t tmp_node = NULL; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_attr_t attr; + _kernel_type_e is2D_sumsqr = SUMSQR_2D_KERNEL; + _kernel_type_e is2D_wh = LAYERNORM_WH_2D_KERNEL; + vsi_nn_kernel_t * kernel_sumSqr = NULL; + vsi_nn_tensor_t * tensor_sumSqr = NULL; + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + + int32_t axis[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t axis_num = 1; + int32_t new_axis[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }}; + uint32_t axis_size = 0; + uint32_t rank_in = 0, rank_para = 0; + uint32_t outer_size = 1; + uint32_t i = 0; + + for(i = 1; i < inputs[0]->attr.dim_num; i++) + { + outer_size *= inputs[0]->attr.size[i]; + } + + status = vsi_nn_kernel_optimize_tensor_shape( + (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + axis, axis_num, new_shape[0], &rank_in, new_axis, &axis_size); + if ( status == FALSE || axis_size > 2) + { + return NULL; + } + + status = vsi_nn_kernel_optimize_tensor_shape( + (int32_t *)inputs[1]->attr.size, inputs[1]->attr.dim_num, + axis, axis_num, new_shape[1], &rank_para, new_axis, &axis_size); + if ( status == FALSE || axis_size > 2) + { + return NULL; + } + + rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape[0], rank_in); + + rs_beta = vsi_nn_kernel_tensor_reshape(inputs[1]->t, new_shape[1], rank_para); + + rs_gamma = vsi_nn_kernel_tensor_reshape(inputs[2]->t, new_shape[1], rank_para); + + rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape[0], rank_in); + + if (rank_in > 2) + { + is2D_sumsqr = SUMSQR_KERNEL; + is2D_wh = LAYERNORM_WH_KERNEL; + } + + kernel_sumSqr = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); + // Assign unique_id + kernel_sumSqr->unique_id = kernel->unique_id; + + memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + attr.is_const = FALSE; + attr.vtl = TRUE; + attr.size[0] = ((new_shape[0][0] + 255) / 256) * 4; + + if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16 + || inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16) + { + attr.size[0] = ((new_shape[0][0] + 127) / 128) * 4; + } + attr.size[1] = outer_size; + attr.size[2] = 1; + attr.size[3] = 1; + attr.dim_num = 4; + tensor_sumSqr = vsi_nn_CreateTensor( graph, &attr ); + + status = _query_kernel_wh(inputs, outputs, kernel_sumSqr, kernel, is2D_sumsqr, is2D_wh); + if ( VSI_SUCCESS != status ) + { + goto final; + } + + { + tmp_node = vsi_nn_kernel_create_node( graph, kernel_sumSqr ); + if (tmp_node) + { + sumSqr_node_params[0] = rs_input; + sumSqr_node_params[1] = (vsi_nn_kernel_node_param_t)tensor_sumSqr->t; + + status = vsi_nn_kernel_node_pass_param( tmp_node, sumSqr_node_params, + _SUMSQR_PARAM_NUM ); + CHECK_STATUS(status); + { + // Set default border mode. 
+ vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U8 = 0; + border.constant_value.U16 = 0; + if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) + { + border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + } + status = vxSetNodeAttribute( (vx_node)tmp_node, VX_NODE_BORDER, &border, sizeof(border) ); + CHECK_STATUS(status); + } + } + } + + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if (node) + { + uint32_t index = 0; + node_params[index++] = rs_input; + node_params[index++] = rs_beta; + node_params[index++] = rs_gamma; + node_params[index++] = (vsi_nn_kernel_node_param_t)tensor_sumSqr->t; + node_params[index++] = rs_output; + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + + status = vsi_nn_kernel_node_pass_param( node, node_params, + _LAYERNORM_WH_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &node_params[5] ); + { + // Set default border mode. + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U8 = 0; + border.constant_value.U16 = 0; + if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) + { + border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + } + status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); + CHECK_STATUS(status); + } + } + } + +final: + if (rs_beta) + { + vsi_nn_kernel_tensor_release( &rs_beta ); + } + if (rs_gamma) + { + vsi_nn_kernel_tensor_release( &rs_gamma ); + } + if (rs_input) + { + vsi_nn_kernel_tensor_release( &rs_input ); + } + if (rs_output) + { + vsi_nn_kernel_tensor_release( &rs_output ); + } + if ( kernel_sumSqr ) + { + vsi_nn_kernel_release( &kernel_sumSqr ); + } + if ( tensor_sumSqr ) + { + vsi_nn_ReleaseTensor( &tensor_sumSqr ); + } + if (tmp_node) {vsi_nn_kernel_node_release( &tmp_node );} + + return node; +} + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_LAYERNORM_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL, rs_gamma = NULL, rs_beta = NULL; + int32_t rs_flg = vsi_nn_kernel_param_get_int32( params, "reshape_flg" ); + int32_t wh_flg = vsi_nn_kernel_param_get_int32( params, "wh_flg" ); + int32_t optFlg = rs_flg || (outputs[0]->attr.dim_num < 3); + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + + if (wh_flg) + { + node = _setup_wh(graph, inputs, input_num, outputs, output_num, params, kernel); + goto final; + } + + status = _query_kernel( inputs, outputs, kernel, optFlg); + if (VSI_SUCCESS != status) + { + goto final; + } + + if (rs_flg) + { + int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + shape[0] = inputs[0]->attr.size[0]; + shape[1] = inputs[0]->attr.size[1] * inputs[0]->attr.size[2]; + shape[2] = 1; + shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; + rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 4 ); + + shape[0] = outputs[0]->attr.size[0]; + shape[1] = outputs[0]->attr.size[1] * outputs[0]->attr.size[2]; + shape[2] = 1; + shape[3] = outputs[0]->attr.dim_num > 3 ? 
outputs[0]->attr.size[3] : 1; + rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 ); + } + if (inputs[1]->attr.dim_num < 2) + { + int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + shape[0] = inputs[1]->attr.size[0]; + shape[1] = 1; + shape[2] = 1; + shape[3] = 1; + rs_beta = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shape, 4 ); + } + if (inputs[2]->attr.dim_num < 2) + { + int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + shape[0] = inputs[2]->attr.size[0]; + shape[1] = 1; + shape[2] = 1; + shape[3] = 1; + rs_gamma = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shape, 4 ); + } + + // Nomalization + node = vsi_nn_kernel_create_node( graph, kernel ); + if (node) + { + uint32_t index = 0; + if (rs_flg) + { + node_params[index++] = rs_input; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t; + } + if (inputs[1]->attr.dim_num < 2) + { + node_params[index++] = rs_beta; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t; + } + if (inputs[2]->attr.dim_num < 2) + { + node_params[index++] = rs_gamma; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t; + } + if (rs_flg) + { + node_params[index++] = rs_output; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)outputs[0]->t; + } + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + + status = vsi_nn_kernel_node_pass_param( node, node_params, + _LAYERNORM_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &node_params[4] ); + { + // Set default border mode. + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U8 = 0; + border.constant_value.U16 = 0; + if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) + { + border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + } + status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); + CHECK_STATUS(status); + } + } + + /* Pass parameters to node. 
*/ +final: + if (rs_beta) + { + vsi_nn_kernel_tensor_release( &rs_beta ); + } + if (rs_gamma) + { + vsi_nn_kernel_tensor_release( &rs_gamma ); + } + if (rs_flg) + { + vsi_nn_kernel_tensor_release( &rs_input ); + vsi_nn_kernel_tensor_release( &rs_output ); + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( layer_norm, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c index e46ea14..42ff180 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c @@ -68,7 +68,6 @@ static const struct { { TENSOR_PRE_PROCESS_BGRA_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1) TENSOR_PRE_PROCESS_BGRA_KERNELS(U8, U8, COPY, KERNEL_SOURCE_1) - TENSOR_PRE_PROCESS_BGRA_KERNELS(U8, U8, SCALE_NHWC, KERNEL_SOURCE_2) }; static vx_param_description_t vxPreProcessBgraKernel_param_def[] = @@ -106,7 +105,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer) int32_t dstZP = 0; float outputScale = 1; int32_t reorder = 0; - int32_t trans = 0; int32_t xRatio = 0; int32_t yRatio = 0; int32_t order1 = 2; @@ -126,8 +124,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &reorder); CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &trans); - CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; dstZP = attr[0]->asymm.zero_point; @@ -135,19 +131,14 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer) width = out_shape->data[0]; height = out_shape->data[1]; - if(trans) - { - width = width / 3; - } - - if(reorder != 0) + if (reorder != 0) { reorder = 2; order1 = 0; } enable_copy = (int32_t)(xRatio == (1 << 15) && yRatio == (1 << 15)); - if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) { if (attr[0]->dfp.fl > 0) { @@ -159,11 +150,11 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer) } dstZP = 0; } - else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { outputScale = 1.0f/outputScale; } - else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) + else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) { outputScale = 1; dstZP = 0; @@ -286,16 +277,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer) 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniExtractInt32BgraToU8Bgr_2x8 = {{ - 0x00333333, // TCfg - 0x00111000, // ASelt - 0x00020100, 0x00000201, // ABin - 0x00000000, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002600, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - // copy gpu_dp_inst_t uniExtractBfromBgra_4x4 = {{ 0x01010101, // TCfg @@ -355,23 +336,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer) 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant }, GPU_DP_TYPE_16 }; - if(trans) - { - status = vsi_nn_kernel_gpu_add_param(node, "uniExtractInt32BgraToU8Bgr_2x8", - &uniExtractInt32BgraToU8Bgr_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp1BgraShort_4x4", &uniBilinearTmp1BgraShort_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, 
"uniBilinearTmp2BgraShort_4x4", &uniBilinearTmp2BgraShort_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp3BgraShort_4x4", &uniBilinearTmp3BgraShort_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp4BgraShort_4x4", &uniBilinearTmp4BgraShort_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp5BgraShort_4x4", &uniBilinearTmp5BgraShort_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp6BgraShort_4x4", &uniBilinearTmp6BgraShort_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp7BgraShort_4x4", &uniBilinearTmp7BgraShort_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp8BgraShort_4x4", &uniBilinearTmp8BgraShort_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniDescaleU8_4x4", &uniDescaleU8_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertIntergetoF32_4x4", &uniConvertIntergetoF32_4x4); - CHECK_STATUS_FAIL_GOTO(status, OnError); - } - else if(enable_copy) + if (enable_copy) { status = vsi_nn_kernel_gpu_add_param(node, "uniExtractBfromBgra_4x4", &uniExtractBfromBgra_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractGfromBgra_4x4", &uniExtractGfromBgra_4x4); @@ -429,16 +394,11 @@ static vsi_status _query_kernel uint32_t key = 0; int i = 0; vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); - vsi_bool enable_perm = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - if(enable_perm) - { - convert_type = SCALE_NHWC; - } - else if(enable_copy) + if (enable_copy) { convert_type = COPY; } @@ -449,14 +409,14 @@ static vsi_status _query_kernel key = HASH_PRE_PROCESS_BGRA_KEY( input0_dtype, output_dtype, convert_type, 0 ); - for( i = 0; i < _cnt_of_array(pre_process_bgra_map); i ++ ) + for ( i = 0; i < _cnt_of_array(pre_process_bgra_map); i ++ ) { if( pre_process_bgra_map[i].key == key ) { break; } } - if( i < _cnt_of_array(pre_process_bgra_map) ) + if ( i < _cnt_of_array(pre_process_bgra_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_bgra_map[i].function_name ); kernel->info.parameters = vxPreProcessBgraKernel_param_def; @@ -488,19 +448,19 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; int32_t shapes[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; vsi_nn_tensor_t* reshape_tensors[1] = {NULL}; - int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); + int32_t trans = 0; - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; } status = _query_kernel( inputs, outputs, kernel, params ); - if( VSI_SUCCESS == status) + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { uint32_t index = 2; int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c index 6976058..8ce0467 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c @@ -43,7 +43,6 @@ __BEGIN_DECLS #define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toU8") #define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOI8 
CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toI8") #define VX_KERNEL_NAME_PRE_PROCESS_NV12_COPY_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_nv12_copy_U8toU8") -#define VX_KERNEL_NAME_PRE_PROCESS_NV12_TRANS_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_nv12_trans_U8toU8") // greater than a quarter #define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOU8_GQ CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toU8_gq") @@ -51,7 +50,6 @@ __BEGIN_DECLS #define KERNEL_SOURCE_1 "pre_process_nv12_scale_8bits", #define KERNEL_SOURCE_2 "pre_process_nv12_scale", -#define KERNEL_SOURCE_3 "pre_process_nv12_trans_u8", #define KERNEL_SOURCE_4 "pre_process_nv12_scale_mix" typedef enum @@ -85,7 +83,6 @@ static const struct { TENSOR_PRE_PROCESS_NV12_KERNELS(U8, U8, COPY, KERNEL_SOURCE_1) TENSOR_PRE_PROCESS_NV12_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_2) TENSOR_PRE_PROCESS_NV12_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_2) - TENSOR_PRE_PROCESS_NV12_KERNELS(U8, U8, TRANS, KERNEL_SOURCE_3) TENSOR_PRE_PROCESS_NV12_GQ_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_4) TENSOR_PRE_PROCESS_NV12_GQ_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_4) }; @@ -156,17 +153,17 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) width = out_shape->data[0]; height = out_shape->data[1]; - if(reorder != 0) + if (reorder != 0) { reorder = 2; order1 = 0; } - if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { dstScale = 1.0f / dstScale; } - else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) + else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) { if (attr[0]->dfp.fl > 0) { @@ -178,7 +175,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) } dstZP = 0; } - else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE) + else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE) { dstScale = 1; dstZP = 0; @@ -295,7 +292,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) int32_t dstZP = 0; float dstScale = 1; int32_t reorder = 0; - int32_t trans = 0; int32_t order1 = 2; uint32_t width = 0; uint32_t height = 0; @@ -325,8 +321,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &reorder); CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &trans); - CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[1]->shape; dstZP = attr[1]->asymm.zero_point; @@ -334,24 +328,21 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) width = out_shape->data[0]; height = out_shape->data[1]; - if(reorder != 0) + if (reorder != 0) { reorder = 2; order1 = 0; } - if(trans) - { - width = width / 3; - } + resize = (float)width / attr[0]->shape->data[0]; xrIntFloat_16 = (attr[0]->shape->data[0] << 16) / width + 1; yrIntFloat_16 = (attr[0]->shape->data[1] << 16) / height + 1; - if(attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { dstScale = 1.0f / dstScale; } - else if(attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP) + else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP) { if (attr[1]->dfp.fl > 0) { @@ -363,7 +354,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) } dstZP = 0; } - else if(attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE) + else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE) { dstScale = 1; dstZP = 0; @@ -450,27 +441,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) }, GPU_DP_TYPE_16 }; //trans - gpu_dp_inst_t uniTransPackBgr1st_2x8 = {{ - 0x11311311, // TCfg - 
0x00100100, // ASelt - 0x01000400, 0x06020105, // ABin - 0x22022022, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000000, 0x00000001, - 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniTransPackBgr2nd_2x8 = {{ - 0x00003113, // TCfg - 0x00001001, // ASelt - 0x03070302, 0x00000000, // ABin - 0x00000220, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002600, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000001, 0x00000001, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniCalculateYShift_2x8 = {{ 0x00009999, // TCfg 0x00000000, // ASelt @@ -502,23 +472,15 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp); - if(resize >= 0.25 && (attr[1]->dtype == U8 || attr[1]->dtype == F16) && !trans) + + if (resize >= 0.25 && (attr[1]->dtype == U8 || attr[1]->dtype == F16)) { status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateYShift_2x8", &uniCalculateYShift_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateUVShift_2x8", &uniCalculateUVShift_2x8); } CHECK_STATUS_FAIL_GOTO(status, OnError ); - if(trans && attr[1]->dtype == U8) - { - status = vsi_nn_kernel_gpu_add_param(node, "uniTransPackBgr1st_2x8", &uniTransPackBgr1st_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniTransPackBgr2nd_2x8", &uniTransPackBgr2nd_2x8); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - else - { - status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); - } + status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); CHECK_STATUS_FAIL_GOTO(status, OnError ); @@ -572,20 +534,15 @@ static vsi_status _query_kernel uint32_t key = 0; int i = 0; vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); - vsi_bool enable_perm = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); uint32_t srcWidth = inputs[0]->attr.size[0]; - uint32_t dstWidth = enable_perm ? 
outputs[0]->attr.size[1] : outputs[0]->attr.size[0]; + uint32_t dstWidth = outputs[0]->attr.size[0]; float scaleVal = (float)dstWidth / srcWidth; uint32_t optFlg = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - if(enable_perm) - { - convert_type = TRANS; - } - else if(enable_copy && output_dtype == U8) + if (enable_copy && output_dtype == U8) { convert_type = COPY; } @@ -594,7 +551,7 @@ static vsi_status _query_kernel convert_type = SCALE; } - if(scaleVal >= 0.25 && (output_dtype == U8 || output_dtype == F16) && convert_type == SCALE) + if (scaleVal >= 0.25 && (output_dtype == U8 || output_dtype == F16) && convert_type == SCALE) { optFlg = 1; } @@ -608,7 +565,7 @@ static vsi_status _query_kernel break; } } - if( i < _cnt_of_array(pre_process_nv12_map) ) + if ( i < _cnt_of_array(pre_process_nv12_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_nv12_map[i].function_name ); kernel->info.parameters = vxPreProcessNv12Kernel_param_def; @@ -646,21 +603,20 @@ static vsi_nn_kernel_node_t _setup vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_NV12_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; - int32_t shapes[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; vsi_nn_tensor_t* reshape_tensors[1] = {NULL}; - int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); + int32_t trans = 0; - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; } status = _query_kernel( inputs, outputs, kernel, params ); - if( VSI_SUCCESS == status) + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { uint32_t index = 3; int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); @@ -674,22 +630,9 @@ static vsi_nn_kernel_node_t _setup int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); /* Pass parameters to node. 
*/ - if(trans) - { - shapes[0] = outputs[0]->attr.size[0] * outputs[0]->attr.size[1]; - shapes[1] = outputs[0]->attr.size[2]; - - reshape_tensors[0] = vsi_nn_reshape_tensor( graph, - outputs[0], (uint32_t*)shapes, outputs[0]->attr.dim_num); - - vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_NV12_PARAM_NUM, - inputs, 2, &reshape_tensors[0], 1 ); - } - else - { - vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_NV12_PARAM_NUM, + vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_NV12_PARAM_NUM, inputs, 2, outputs, 1 ); - } + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c index c5ea1c5..09f55a6 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c @@ -90,14 +90,6 @@ static const struct { TENSOR_PRE_PROCESS_RGB_KERNELS(U8, I16, COPY, KERNEL_SOURCE_2) TENSOR_PRE_PROCESS_RGB_KERNELS(U8, U8, COPY, KERNEL_SOURCE_2) TENSOR_PRE_PROCESS_RGB_KERNELS(U8, I8, COPY, KERNEL_SOURCE_2) - TENSOR_PRE_PROCESS_RGB_KERNELS(U8, F16, SCALE_NHWC, KERNEL_SOURCE_3) - TENSOR_PRE_PROCESS_RGB_KERNELS(U8, I16, SCALE_NHWC, KERNEL_SOURCE_3) - TENSOR_PRE_PROCESS_RGB_KERNELS(U8, U8, SCALE_NHWC, KERNEL_SOURCE_3) - TENSOR_PRE_PROCESS_RGB_KERNELS(U8, I8, SCALE_NHWC, KERNEL_SOURCE_3) - TENSOR_PRE_PROCESS_RGB_KERNELS(U8, F16, COPY_NHWC, KERNEL_SOURCE_4) - TENSOR_PRE_PROCESS_RGB_KERNELS(U8, I16, COPY_NHWC, KERNEL_SOURCE_4) - TENSOR_PRE_PROCESS_RGB_KERNELS(U8, U8, COPY_NHWC, KERNEL_SOURCE_4) - TENSOR_PRE_PROCESS_RGB_KERNELS(U8, I8, COPY_NHWC, KERNEL_SOURCE_4) }; static vx_param_description_t vxPreProcessRgbKernel_param_def[] = @@ -156,8 +148,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &reorder); CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &trans); - CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; outputZP = (float)attr[0]->asymm.zero_point; @@ -165,14 +155,14 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) width = out_shape->data[0]; height = out_shape->data[1]; - if(reorder != 0) + if (reorder != 0) { reorder = 2; order1 = 0; } enable_copy = (int32_t)(xRatio == (1 << 15) && yRatio == (1 << 15)); - if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) { if (attr[0]->dfp.fl > 0) { @@ -184,11 +174,11 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) } outputZP = 0; } - else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { outputScale = 1.0f / outputScale; } - else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) + else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) { outputScale = 1; outputZP = 0; @@ -199,48 +189,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) pack_key = _PACK_SELECT_KEY( enable_copy, reorder, trans); { - // trans and copy - gpu_dp_inst_t uniNormilizationLo_2x8 = {{ - 0x99999999, // TCfg - 0x44444444, // ASelt - 0x45002142, 0x27480324, // ABin - 0x99999999, // BSelt - 0x06060606, 0x06060606, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 
0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, - 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniNormilizationHi_2x8 = {{ - 0x09999999, // TCfg - 0x04444444, // ASelt - 0x092a4b06, 0x000c2d4e, // ABin - 0x09999999, // BSelt - 0x06060606, 0x00060606, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, - 0x3c000000, 0x3c000000, 0x3c000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniNormilizationLo_NHWC_2x8 = {{ - 0x99999999, // TCfg - 0x44444444, // ASelt - 0x03422100, 0x27064524, // ABin - 0x99999999, // BSelt - 0x06060606, 0x06060606, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, - 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniNormilizationHi_NHWC_2x8 = {{ - 0x09999999, // TCfg - 0x04444444, // ASelt - 0x4b2a0948, 0x004e2d0c, // ABin - 0x09999999, // BSelt - 0x06060606, 0x00060606, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, - 0x3c000000, 0x3c000000, 0x3c000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - // copy gpu_dp_inst_t uniExtractRtoF32_part0_4x4 = {{ 0x01010101, // TCfg @@ -404,79 +352,9 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniRePackRGBLo_2x8 = {{ - 0x00111111, // TCfg - 0x00001001, // ASelt - 0x01000400, 0x00000105, // ABin - 0x00222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniRePackRGBHi_2x8 = {{ - 0x00111111, // TCfg - 0x00001001, // ASelt - 0x03020602, 0x00000307, // ABin - 0x00222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniRePackRGBLo_NHWC_2x8 = {{ - 0x00111111, // TCfg - 0x00100100, // ASelt - 0x01000400, 0x00000105, // ABin - 0x00222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniRePackRGBHi_NHWC_2x8 = {{ - 0x00111111, // TCfg - 0x00100100, // ASelt - 0x03020602, 0x00000307, // ABin - 0x00222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - switch( pack_key ) + switch ( pack_key ) { - case _PACK_SELECT_KEY( 1, 0, 1): // copy trans - { - shaderParam.global_scale[0] = 15; - shaderParam.global_scale[1] = 1; - shaderParam.global_scale[2] = 1; - shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) - / shaderParam.global_scale[0], 4); - shaderParam.global_size[1] = height; - shaderParam.global_size[2] = 1; - - status = vsi_nn_kernel_gpu_add_param(node, "uniNormilizationLo_2x8", &uniNormilizationLo_NHWC_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, 
"uniNormilizationHi_2x8", &uniNormilizationHi_NHWC_2x8); - CHECK_STATUS_FAIL_GOTO(status, OnError); - } - break; - case _PACK_SELECT_KEY( 1, 2, 1): // copy reorder trans - { - shaderParam.global_scale[0] = 15; - shaderParam.global_scale[1] = 1; - shaderParam.global_scale[2] = 1; - shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) - / shaderParam.global_scale[0], 4); - shaderParam.global_size[1] = height; - shaderParam.global_size[2] = 1; - - status = vsi_nn_kernel_gpu_add_param(node, "uniNormilizationLo_2x8", &uniNormilizationLo_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniNormilizationHi_2x8", &uniNormilizationHi_2x8); - CHECK_STATUS_FAIL_GOTO(status, OnError); - } - break; case _PACK_SELECT_KEY( 1, 0, 0): // copy case _PACK_SELECT_KEY( 1, 2, 0): // copy reorder { @@ -539,68 +417,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError); } break; - case _PACK_SELECT_KEY( 0, 0, 1): // trans - { - shaderParam.global_scale[0] = 4; - shaderParam.global_scale[1] = 1; - shaderParam.global_scale[2] = 1; - shaderParam.global_size[0] = gpu_align_p2((width / 3 + shaderParam.global_scale[0] - 1) - / shaderParam.global_scale[0], 4); - shaderParam.global_size[1] = height; - shaderParam.global_size[2] = 1; - - if(attr[0]->dtype == F16) - { - status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8); - } - else - { - status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractInteger_2x8); - } - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertIntergetoF32_4x4", &uniConvertIntergetoF32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniUnpackToR", &uniUnpackToR); - status |= vsi_nn_kernel_gpu_add_param(node, "uniUnpackToG", &uniUnpackToG); - status |= vsi_nn_kernel_gpu_add_param(node, "uniUnpackToB", &uniUnpackToB); - status |= vsi_nn_kernel_gpu_add_param(node, "uniVecShift10", &uniVecShift10); - status |= vsi_nn_kernel_gpu_add_param(node, "uniAddRShift", &uniAddRShift); - status |= vsi_nn_kernel_gpu_add_param(node, "uniGetTempVal", &uniGetTempVal); - status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBytes", &uniExtractBytes); - status |= vsi_nn_kernel_gpu_add_param(node, "uniRePackRGBLo_2x8", &uniRePackRGBLo_NHWC_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniRePackRGBHi_2x8", &uniRePackRGBHi_NHWC_2x8); - CHECK_STATUS_FAIL_GOTO(status, OnError); - } - break; - case _PACK_SELECT_KEY( 0, 2, 1): // reorder trans - { - shaderParam.global_scale[0] = 4; - shaderParam.global_scale[1] = 1; - shaderParam.global_scale[2] = 1; - shaderParam.global_size[0] = gpu_align_p2((width / 3 + shaderParam.global_scale[0] - 1) - / shaderParam.global_scale[0], 4); - shaderParam.global_size[1] = height; - shaderParam.global_size[2] = 1; - - if(attr[0]->dtype == F16) - { - status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8); - } - else - { - status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractInteger_2x8); - } - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertIntergetoF32_4x4", &uniConvertIntergetoF32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniUnpackToR", &uniUnpackToR); - status |= vsi_nn_kernel_gpu_add_param(node, "uniUnpackToG", &uniUnpackToG); - status |= vsi_nn_kernel_gpu_add_param(node, "uniUnpackToB", &uniUnpackToB); - status |= vsi_nn_kernel_gpu_add_param(node, "uniVecShift10", &uniVecShift10); - status |= vsi_nn_kernel_gpu_add_param(node, "uniAddRShift", &uniAddRShift); - status |= 
vsi_nn_kernel_gpu_add_param(node, "uniGetTempVal", &uniGetTempVal); - status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBytes", &uniExtractBytes); - status |= vsi_nn_kernel_gpu_add_param(node, "uniRePackRGBLo_2x8", &uniRePackRGBLo_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniRePackRGBHi_2x8", &uniRePackRGBHi_2x8); - CHECK_STATUS_FAIL_GOTO(status, OnError); - } - break; default: break; } @@ -637,23 +453,14 @@ static vsi_status _query_kernel uint32_t key = 0; int i = 0; vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); - vsi_bool enable_perm = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - if(enable_copy && enable_perm) - { - convert_type = COPY_NHWC; - } - else if(enable_copy) + if (enable_copy) { convert_type = COPY; } - else if(enable_perm) - { - convert_type = SCALE_NHWC; - } else { convert_type = SCALE; @@ -661,14 +468,14 @@ static vsi_status _query_kernel key = HASH_PRE_PROCESS_RGB_KEY( input0_dtype, output_dtype, convert_type, 0 ); - for( i = 0; i < _cnt_of_array(pre_process_rgb_map); i ++ ) + for ( i = 0; i < _cnt_of_array(pre_process_rgb_map); i ++ ) { if( pre_process_rgb_map[i].key == key ) { break; } } - if( i < _cnt_of_array(pre_process_rgb_map) ) + if ( i < _cnt_of_array(pre_process_rgb_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_rgb_map[i].function_name ); kernel->info.parameters = vxPreProcessRgbKernel_param_def; @@ -698,21 +505,20 @@ static vsi_nn_kernel_node_t _setup vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_RGB_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; - int32_t shapes[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; vsi_nn_tensor_t* reshape_tensors[1] = {NULL}; - int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); + int32_t trans = 0; - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; } status = _query_kernel( inputs, outputs, kernel, params ); - if( VSI_SUCCESS == status) + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { uint32_t index = 2; int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); @@ -726,18 +532,7 @@ static vsi_nn_kernel_node_t _setup int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); /* Pass parameters to node. 
*/ - if(trans) - { - shapes[0] = outputs[0]->attr.size[0] * outputs[0]->attr.size[1]; - shapes[1] = outputs[0]->attr.size[2]; - - reshape_tensors[0] = vsi_nn_reshape_tensor( graph, - outputs[0], (uint32_t*)shapes, outputs[0]->attr.dim_num); - - vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_RGB_PARAM_NUM, - inputs, 1, &reshape_tensors[0], 1 ); - } - else + if (trans == 0) { vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_RGB_PARAM_NUM, inputs, 1, outputs, 1 ); @@ -767,7 +562,7 @@ static vsi_nn_kernel_node_t _setup } } - if(reshape_tensors[0]) + if (reshape_tensors[0]) { vsi_nn_ReleaseTensor(&reshape_tensors[0]); } diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c index b7617ae..2d32371 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c @@ -43,15 +43,12 @@ __BEGIN_DECLS #define VX_KERNEL_NAME_PRE_PROCESS_YUV420_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_scale_U8toU8") #define VX_KERNEL_NAME_PRE_PROCESS_YUV420_SCALE_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_scale_U8toI8") #define VX_KERNEL_NAME_PRE_PROCESS_YUV420_COPY_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_copy_U8toU8") -#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_COPY_TRANS_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_copy_trans_U8") -#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_TRANS_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_trans_U8toU8") #define KERNEL_SOURCE_1 "pre_process_yuv420_scale_u8", #define KERNEL_SOURCE_2 "pre_process_yuv420_copy_u8", #define KERNEL_SOURCE_3 "pre_process_yuv420_scale_fp16", #define KERNEL_SOURCE_4 "pre_process_yuv420_scale_i16", #define KERNEL_SOURCE_5 "pre_process_yuv420_scale_i8", -#define KERNEL_SOURCE_6 "pre_process_yuv420_trans_u8" typedef enum { @@ -80,8 +77,6 @@ static const struct { TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1) TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_5) TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, COPY, KERNEL_SOURCE_2) - TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, COPY_TRANS, KERNEL_SOURCE_2) - TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, TRANS, KERNEL_SOURCE_6) }; static vx_param_description_t vxPreProcessYuv420Kernel_param_def[] = @@ -143,24 +138,24 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer) width = out_shape->data[0]; height = out_shape->data[1]; - if(reorder != 0) + if (reorder != 0) { reorder = 2; order1 = 0; } - if(trans) + if (trans) { width = width / 3; } - if(attr[0]->dtype == U8) + if (attr[0]->dtype == U8) { dstScale = 1.0f / dstScale; } shaderParam.global_scale[0] = 16; - if(attr[0]->dtype == I16 || attr[0]->dtype == F16) + if (attr[0]->dtype == I16 || attr[0]->dtype == F16) { shaderParam.global_scale[0] = 8; } @@ -176,131 +171,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError); { - gpu_dp_inst_t uniPackBG0_2x8 = {{ - 0x11011011, // TCfg - 0x10010010, // ASelt - 0x01000000, 0x02020001, // ABin - 0x22022022, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000000, 0x00000001, - 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - - gpu_dp_inst_t uniPackTmpAndR_2x8 = {{ - 0x11111111, // TCfg - 0x00100100, // ASelt - 0x03000100, 0x07060104, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 
0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - - gpu_dp_inst_t uniPackRB0_2x8 = {{ - 0x11011011, // TCfg - 0x10010010, // ASelt - 0x03000302, 0x05040004, // ABin - 0x22022022, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000000, 0x00000001, - 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPackTmp0AndG_2x8 = {{ - 0x11111111, // TCfg - 0x00100100, // ASelt - 0x03030100, 0x07060404, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - - gpu_dp_inst_t uniPackGR1_2x8 = {{ - 0x11011011, // TCfg - 0x10010010, // ASelt - 0x06000505, 0x07070006, // ABin - 0x22022022, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000000, 0x00000001, - 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPackTmp1AndB_2x8 = {{ - 0x11111111, // TCfg - 0x00100100, // ASelt - 0x03060100, 0x07060704, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPackBG1_2x8 = {{ - 0x11011011, // TCfg - 0x10010010, // ASelt - 0x09000808, 0x0a0a0009, // ABin - 0x22022022, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000000, 0x00000001, - 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPackTmp1AndR_2x8 = {{ - 0x11111111, // TCfg - 0x00100100, // ASelt - 0x03080100, 0x07060904, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - - gpu_dp_inst_t uniPackRB2_2x8 = {{ - 0x11011011, // TCfg - 0x10010010, // ASelt - 0x0b000b0a, 0x0d0c000c, // ABin - 0x22022022, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000000, 0x00000001, - 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPackTmp2AndG_2x8 = {{ - 0x11111111, // TCfg - 0x00100100, // ASelt - 0x030b0100, 0x07060c04, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPackGR2_2x8 = {{ - 0x11011011, // TCfg - 0x10010010, // ASelt - 0x0e000d0d, 0x0f0f000e, // ABin - 0x22022022, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000000, 0x00000001, - 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPackTmp2AndB_2x8 = {{ - 0x11111111, // TCfg - 0x00100100, // 
ASelt - 0x030e0100, 0x07060f04, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniCalculateTmpR1st_4x4 = {{ 0x05050505, // TCfg 0x04040404, // ASelt @@ -574,19 +444,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpB4th_4x4", &uniCalculateTmpB4th_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateB1st_4x4", &uniCalculateR1st_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackBG0_2x8", &uniPackBG0_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmpAndR_2x8", &uniPackTmpAndR_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackRB0_2x8", &uniPackRB0_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp0AndG_2x8", &uniPackTmp0AndG_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackGR1_2x8", &uniPackGR1_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp1AndB_2x8", &uniPackTmp1AndB_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackBG1_2x8", &uniPackBG1_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp1AndR_2x8", &uniPackTmp1AndR_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackRB2_2x8", &uniPackRB2_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp2AndG_2x8", &uniPackTmp2AndG_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackGR2_2x8", &uniPackGR2_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp2AndB_2x8", &uniPackTmp2AndB_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8LoB_2x8", &uniQuantU8toU8LoB_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8HiB_2x8", &uniQuantU8toU8HiB_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8LoG_2x8", &uniQuantU8toU8LoG_2x8); @@ -633,7 +490,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer) int32_t dstZP = 0; float dstScale = 1; int32_t reorder = 0; - int32_t trans = 0; int32_t order1 = 2; uint32_t width = 0; uint32_t height = 0; @@ -646,8 +502,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &reorder); CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &trans); - CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; dstZP = attr[0]->asymm.zero_point; @@ -655,17 +509,13 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer) width = out_shape->data[0]; height = out_shape->data[1]; - if(reorder != 0) + if (reorder != 0) { reorder = 2; order1 = 0; } - if(trans) - { - width = width / 3; - } - if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) { if (attr[0]->dfp.fl > 0) { @@ -677,11 +527,11 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer) } dstZP = 0; } - else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { - dstScale = 1.0f/dstScale; + dstScale = 1.0f / dstScale; } - else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) + else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) { dstScale = 1; dstZP = 0; @@ -925,26 +775,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer) 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant }, 
GPU_DP_TYPE_16 }; - //trans - gpu_dp_inst_t uniTransPackBgr1st_2x8 = {{ - 0x11311311, // TCfg - 0x00100100, // ASelt - 0x01000400, 0x06020105, // ABin - 0x22022022, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000000, 0x00000001, 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniTransPackBgr2nd_2x8 = {{ - 0x00003113, // TCfg - 0x00001001, // ASelt - 0x03070302, 0x00000000, // ABin - 0x00000220, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002600, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000001, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - status = vsi_nn_kernel_gpu_add_param(node, "uniCalculateR1st_4x4", &uniCalculateR1st_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpGbyU_2x8", &uniCalculateTmpGbyU_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpGbyU2nd_2x8", &uniCalculateTmpGbyU2nd_2x8); @@ -975,16 +805,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateGWise_4x4", &uniCalculateGWise_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateGWise2nd_4x4", &uniCalculateGWise2nd_4x4); - if(trans) - { - status = vsi_nn_kernel_gpu_add_param(node, "uniTransPackBgr1st_2x8", &uniTransPackBgr1st_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniTransPackBgr2nd_2x8", &uniTransPackBgr2nd_2x8); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - else - { - status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); - } + status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); CHECK_STATUS_FAIL_GOTO(status, OnError ); @@ -1041,20 +862,11 @@ static vsi_status _query_kernel uint32_t key = 0; int i = 0; vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); - vsi_bool enable_perm = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - if(enable_perm && enable_copy) - { - convert_type = COPY_TRANS; - } - else if(enable_perm) - { - convert_type = TRANS; - } - else if(enable_copy && output_dtype == U8) + if (enable_copy && output_dtype == U8) { convert_type = COPY; } @@ -1065,20 +877,20 @@ static vsi_status _query_kernel key = HASH_PRE_PROCESS_YUV420_KEY( input0_dtype, output_dtype, convert_type, 0 ); - for( i = 0; i < _cnt_of_array(pre_process_yuv420_map); i ++ ) + for ( i = 0; i < _cnt_of_array(pre_process_yuv420_map); i ++ ) { - if( pre_process_yuv420_map[i].key == key ) + if ( pre_process_yuv420_map[i].key == key ) { break; } } - if( i < _cnt_of_array(pre_process_yuv420_map) ) + if ( i < _cnt_of_array(pre_process_yuv420_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_yuv420_map[i].function_name ); kernel->info.parameters = vxPreProcessYuv420Kernel_param_def; kernel->info.numParams = _cnt_of_array( vxPreProcessYuv420Kernel_param_def ); - if(enable_copy && output_dtype == U8) + if (enable_copy && output_dtype == U8) { kernel->info.initialize = _pre_process_yuv420_copy_initializer; } @@ -1110,21 +922,20 @@ static vsi_nn_kernel_node_t _setup vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_YUV420_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; - int32_t 
shapes[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; vsi_nn_tensor_t* reshape_tensors[1] = {NULL}; - int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); + int32_t trans = 0; - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; } status = _query_kernel( inputs, outputs, kernel, params ); - if( VSI_SUCCESS == status) + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { uint32_t index = 4; int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); @@ -1138,22 +949,10 @@ static vsi_nn_kernel_node_t _setup int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); /* Pass parameters to node. */ - if(trans) - { - shapes[0] = outputs[0]->attr.size[0] * outputs[0]->attr.size[1]; - shapes[1] = outputs[0]->attr.size[2]; - reshape_tensors[0] = vsi_nn_reshape_tensor( graph, - outputs[0], (uint32_t*)shapes, outputs[0]->attr.dim_num); + vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_YUV420_PARAM_NUM, + inputs, 3, outputs, 1 ); - vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_YUV420_PARAM_NUM, - inputs, 3, &reshape_tensors[0], 1 ); - } - else - { - vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_YUV420_PARAM_NUM, - inputs, 3, outputs, 1 ); - } tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); @@ -1178,7 +977,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &tmp_params[13] ); } } - if(reshape_tensors[0]) + if (reshape_tensors[0]) { vsi_nn_ReleaseTensor(&reshape_tensors[0]); } diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c index adb16cb..7d51d43 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c @@ -43,11 +43,8 @@ __BEGIN_DECLS #define VX_KERNEL_NAME_PRE_PROCESS_YUV444_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_scale_U8toU8") #define VX_KERNEL_NAME_PRE_PROCESS_YUV444_SCALE_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_scale_U8toI8") #define VX_KERNEL_NAME_PRE_PROCESS_YUV444_COPY_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_copy_U8toU8") -#define VX_KERNEL_NAME_PRE_PROCESS_YUV444_COPY_TRANS_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_copy_trans_U8") -#define VX_KERNEL_NAME_PRE_PROCESS_YUV444_TRANS_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_trans_U8toU8") #define KERNEL_SOURCE_1 "pre_process_yuv444_scale", -#define KERNEL_SOURCE_2 "pre_process_yuv444_trans_u8", #define KERNEL_SOURCE_3 "pre_process_yuv444_scale_fp16", #define KERNEL_SOURCE_4 "pre_process_yuv444_copy_u8", @@ -78,8 +75,6 @@ static const struct { TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1) TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_1) TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, U8, COPY, KERNEL_SOURCE_4) - TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, U8, COPY_TRANS, KERNEL_SOURCE_4) - TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, U8, TRANS, KERNEL_SOURCE_2) }; static vx_param_description_t vxPreProcessYuv444Kernel_param_def[] = @@ -119,7 +114,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer) int32_t dstZP = 0; float dstScale = 1; int32_t reorder = 
0; - int32_t trans = 0; int32_t order1 = 2; uint32_t width = 0; uint32_t height = 0; @@ -132,8 +126,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &reorder); CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &trans); - CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; dstZP = attr[0]->asymm.zero_point; @@ -141,24 +133,19 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer) width = out_shape->data[0]; height = out_shape->data[1]; - if(reorder != 0) + if (reorder != 0) { reorder = 2; order1 = 0; } - if(trans) - { - width = width / 3; - } - - if(attr[0]->dtype == U8) + if (attr[0]->dtype == U8) { dstScale = 1.0f / dstScale; } shaderParam.global_scale[0] = 16; - if(attr[0]->dtype == I16 || attr[0]->dtype == F16) + if (attr[0]->dtype == I16 || attr[0]->dtype == F16) { shaderParam.global_scale[0] = 8; } @@ -174,131 +161,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError); { - gpu_dp_inst_t uniPackBG0_2x8 = {{ - 0x11011011, // TCfg - 0x10010010, // ASelt - 0x01000000, 0x02020001, // ABin - 0x22022022, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000000, 0x00000001, - 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - - gpu_dp_inst_t uniPackTmpAndR_2x8 = {{ - 0x11111111, // TCfg - 0x00100100, // ASelt - 0x03000100, 0x07060104, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - - gpu_dp_inst_t uniPackRB0_2x8 = {{ - 0x11011011, // TCfg - 0x10010010, // ASelt - 0x03000302, 0x05040004, // ABin - 0x22022022, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000000, 0x00000001, - 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPackTmp0AndG_2x8 = {{ - 0x11111111, // TCfg - 0x00100100, // ASelt - 0x03030100, 0x07060404, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - - gpu_dp_inst_t uniPackGR1_2x8 = {{ - 0x11011011, // TCfg - 0x10010010, // ASelt - 0x06000505, 0x07070006, // ABin - 0x22022022, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000000, 0x00000001, - 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPackTmp1AndB_2x8 = {{ - 0x11111111, // TCfg - 0x00100100, // ASelt - 0x03060100, 0x07060704, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPackBG1_2x8 = {{ - 0x11011011, // TCfg - 0x10010010, // ASelt - 0x09000808, 0x0a0a0009, // ABin - 0x22022022, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 
0x00000001, 0x00000001, 0x00000000, 0x00000001, - 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPackTmp1AndR_2x8 = {{ - 0x11111111, // TCfg - 0x00100100, // ASelt - 0x03080100, 0x07060904, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - - gpu_dp_inst_t uniPackRB2_2x8 = {{ - 0x11011011, // TCfg - 0x10010010, // ASelt - 0x0b000b0a, 0x0d0c000c, // ABin - 0x22022022, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000000, 0x00000001, - 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPackTmp2AndG_2x8 = {{ - 0x11111111, // TCfg - 0x00100100, // ASelt - 0x030b0100, 0x07060c04, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPackGR2_2x8 = {{ - 0x11011011, // TCfg - 0x10010010, // ASelt - 0x0e000d0d, 0x0f0f000e, // ABin - 0x22022022, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000000, 0x00000001, - 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPackTmp2AndB_2x8 = {{ - 0x11111111, // TCfg - 0x00100100, // ASelt - 0x030e0100, 0x07060f04, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniCalculateTmpR1st_4x4 = {{ 0x05050505, // TCfg 0x04040404, // ASelt @@ -563,19 +425,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpB4th_4x4", &uniCalculateTmpB4th_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateB1st_4x4", &uniCalculateR1st_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackBG0_2x8", &uniPackBG0_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmpAndR_2x8", &uniPackTmpAndR_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackRB0_2x8", &uniPackRB0_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp0AndG_2x8", &uniPackTmp0AndG_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackGR1_2x8", &uniPackGR1_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp1AndB_2x8", &uniPackTmp1AndB_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackBG1_2x8", &uniPackBG1_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp1AndR_2x8", &uniPackTmp1AndR_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackRB2_2x8", &uniPackRB2_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp2AndG_2x8", &uniPackTmp2AndG_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackGR2_2x8", &uniPackGR2_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp2AndB_2x8", &uniPackTmp2AndB_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8LoB_2x8", &uniQuantU8toU8LoB_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8HiB_2x8", &uniQuantU8toU8HiB_2x8); 
status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8LoG_2x8", &uniQuantU8toU8LoG_2x8); @@ -622,7 +471,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer) int32_t dstZP = 0; float dstScale = 1; int32_t reorder = 0; - int32_t trans = 0; int32_t order1 = 2; uint32_t width = 0; uint32_t height = 0; @@ -635,8 +483,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &reorder); CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &trans); - CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; dstZP = attr[0]->asymm.zero_point; @@ -644,17 +490,13 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer) width = out_shape->data[0]; height = out_shape->data[1]; - if(reorder != 0) + if (reorder != 0) { reorder = 2; order1 = 0; } - if(trans) - { - width = width / 3; - } - if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) { if (attr[0]->dfp.fl > 0) { @@ -666,11 +508,11 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer) } dstZP = 0; } - else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { - dstScale = 1.0f/dstScale; + dstScale = 1.0f / dstScale; } - else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) + else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) { dstScale = 1; dstZP = 0; @@ -914,26 +756,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer) 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant }, GPU_DP_TYPE_16 }; - //trans - gpu_dp_inst_t uniTransPackBgr1st_2x8 = {{ - 0x11311311, // TCfg - 0x00100100, // ASelt - 0x01000400, 0x06020105, // ABin - 0x22022022, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000000, 0x00000001, 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniTransPackBgr2nd_2x8 = {{ - 0x00003113, // TCfg - 0x00001001, // ASelt - 0x03070302, 0x00000000, // ABin - 0x00000220, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002600, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000001, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - status = vsi_nn_kernel_gpu_add_param(node, "uniCalculateR1st_4x4", &uniCalculateR1st_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpGbyU_2x8", &uniCalculateTmpGbyU_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpGbyU2nd_2x8", &uniCalculateTmpGbyU2nd_2x8); @@ -963,17 +785,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateGWise_4x4", &uniCalculateGWise_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateGWise2nd_4x4", &uniCalculateGWise2nd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); - if(trans) - { - status = vsi_nn_kernel_gpu_add_param(node, "uniTransPackBgr1st_2x8", &uniTransPackBgr1st_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniTransPackBgr2nd_2x8", &uniTransPackBgr2nd_2x8); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - else - { - status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); - } status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); CHECK_STATUS_FAIL_GOTO(status, OnError ); @@ -1024,20 +837,11 @@ static 
vsi_status _query_kernel uint32_t key = 0; int i = 0; vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); - vsi_bool enable_perm = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - if(enable_perm && enable_copy) - { - convert_type = COPY_TRANS; - } - else if(enable_perm) - { - convert_type = TRANS; - } - else if(enable_copy && output_dtype == U8) + if (enable_copy && output_dtype == U8) { convert_type = COPY; } @@ -1048,20 +852,20 @@ static vsi_status _query_kernel key = HASH_PRE_PROCESS_YUV444_KEY( input0_dtype, output_dtype, convert_type, 0 ); - for( i = 0; i < _cnt_of_array(pre_process_yuv444_map); i ++ ) + for ( i = 0; i < _cnt_of_array(pre_process_yuv444_map); i ++ ) { - if( pre_process_yuv444_map[i].key == key ) + if ( pre_process_yuv444_map[i].key == key ) { break; } } - if( i < _cnt_of_array(pre_process_yuv444_map) ) + if ( i < _cnt_of_array(pre_process_yuv444_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_yuv444_map[i].function_name ); kernel->info.parameters = vxPreProcessYuv444Kernel_param_def; kernel->info.numParams = _cnt_of_array( vxPreProcessYuv444Kernel_param_def ); - if(enable_copy && output_dtype == U8) + if (enable_copy && output_dtype == U8) { kernel->info.initialize = _pre_process_yuv444_copy_initializer; } @@ -1093,21 +897,20 @@ static vsi_nn_kernel_node_t _setup vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_YUV444_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; - int32_t shapes[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; vsi_nn_tensor_t* reshape_tensors[1] = {NULL}; - int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); + int32_t trans = 0; - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; } status = _query_kernel( inputs, outputs, kernel, params ); - if( VSI_SUCCESS == status) + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { uint32_t index = 4; int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); @@ -1121,22 +924,9 @@ static vsi_nn_kernel_node_t _setup int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); /* Pass parameters to node. 
*/ - if(trans) - { - shapes[0] = outputs[0]->attr.size[0] * outputs[0]->attr.size[1]; - shapes[1] = outputs[0]->attr.size[2]; + vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_YUV444_PARAM_NUM, + inputs, 3, outputs, 1 ); - reshape_tensors[0] = vsi_nn_reshape_tensor( graph, - outputs[0], (uint32_t*)shapes, outputs[0]->attr.dim_num); - - vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_YUV444_PARAM_NUM, - inputs, 3, &reshape_tensors[0], 1 ); - } - else - { - vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_YUV444_PARAM_NUM, - inputs, 3, outputs, 1 ); - } tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); diff --git a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c index 0cc7c61..af3e06f 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c @@ -369,6 +369,26 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniRightSubLeft_4x4 = {{ + 0x09090909, // TCfg + 0x00000000, // ASelt + 0x00230001, 0x00670045, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniDFPtoFp32_left_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00020000, 0x00060004, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; if (I8 == input_dtype && I8 == output_dtype && out_width > in_width) { @@ -405,7 +425,8 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) status = vsi_nn_kernel_gpu_add_param( node, "uniConvertI32toI16_2x8", &uniConvertI32toI16_2x8); status |= vsi_nn_kernel_gpu_add_param( node, "uniGetMaskShift_2x8", &uniGetMaskShift_2x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertDFP2FP32_part1_4x4", + status |= vsi_nn_kernel_gpu_add_param( node, "uniDFPtoFp32_left_4x4", &uniConvertDFP2FP32_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniConvertDFP2FP32_part1_4x4); status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth); CHECK_STATUS_FAIL_GOTO(status, final ); @@ -447,16 +468,22 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) status = vsi_nn_kernel_gpu_add_param( node, "uniConvertI32toI16_2x8", &uniConvertI32toI16_2x8); status |= vsi_nn_kernel_gpu_add_param( node, "uniGetMaskShift_2x8", &uniGetMaskShift_2x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertDFP2FP32_part1_4x4", + status |= vsi_nn_kernel_gpu_add_param( node, "uniDFPtoFp32_left_4x4", &uniConvertDFP2FP32_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniConvertDFP2FP32_part1_4x4); status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth); CHECK_STATUS_FAIL_GOTO(status, final ); gpu_param.global_scale[2] = depth; } + else + { + status = vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniRightSubLeft_4x4); + status |= 
vsi_nn_kernel_gpu_add_param( node, "uniDFPtoFp32_left_4x4", &uniDFPtoFp32_left_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } - status = vsi_nn_kernel_gpu_add_param( node, "uniConvertDFP2FP32_4x4", &uniConvertDFP2FP32_4x4); - status |= vsi_nn_kernel_gpu_add_param( node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8); + status = vsi_nn_kernel_gpu_add_param( node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8); status |= vsi_nn_kernel_gpu_add_param( node, "scale_xy", scale_factor); status |= vsi_nn_kernel_gpu_add_param( node, "dfpScale", &dfpScale); CHECK_STATUS_FAIL_GOTO(status, final ); @@ -485,10 +512,33 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) 0x00010001, 0x00000000, 0x00010001, 0x00000000, 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8SubZPtoFp32_left_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00020000, 0x00060004, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8RightSubLeft_4x4 = {{ + 0x09090909, // TCfg + 0x00000000, // ASelt + 0x00230001, 0x00670045, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + if (F16 == output_dtype) { - status = vsi_nn_kernel_gpu_add_param( node, "uint8Scale", &input_scale); + status = vsi_nn_kernel_gpu_add_param( node, "uint8Scale", &input_scale); + status |= vsi_nn_kernel_gpu_add_param( node, "uniU8SubZPtoFp32_left_4x4", &uniU8SubZPtoFp32_left_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniU8RightSubLeft_4x4", &uniU8RightSubLeft_4x4); CHECK_STATUS_FAIL_GOTO(status, final ); } else @@ -544,13 +594,21 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) } else { + status = vsi_nn_kernel_gpu_add_param( node, "uniU8SubZPtoFp32_left_4x4", &uniU8SubZPtoFp32_4x4); status |= vsi_nn_kernel_gpu_add_param( node, - "uniU8SubZPtoFp32_part1_4x4", &uniU8SubZPtoFp32_part1_4x4); + "uniU8RightSubLeft_4x4", &uniU8SubZPtoFp32_part1_4x4); } CHECK_STATUS_FAIL_GOTO(status, final ); gpu_param.global_scale[2] = depth; } + else if (!is_use_scale_kernel) + { + status = vsi_nn_kernel_gpu_add_param( node, "uniU8SubZPtoFp32_left_4x4", &uniU8SubZPtoFp32_left_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniU8RightSubLeft_4x4", &uniU8RightSubLeft_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + if (!is_use_scale_kernel) { status = vsi_nn_kernel_gpu_add_param( node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8); @@ -562,8 +620,7 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) status = vsi_nn_kernel_gpu_add_param( node, "scale_xy", scale_factor); if (!is_use_scale_kernel) { - status = vsi_nn_kernel_gpu_add_param( node, "uniU8SubZPtoFp32_4x4", &uniU8SubZPtoFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param( node, "input_ZP", &inputZP); + status = vsi_nn_kernel_gpu_add_param( node, "input_ZP", &inputZP); } CHECK_STATUS_FAIL_GOTO(status, final ); } @@ -581,25 +638,25 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniFp16toFp32_4x4 = {{ - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00010000, 0x00030002, // ABin - 
0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant - }, GPU_DP_TYPE_16}; gpu_dp_inst_t uniRightSubLeft_4x4 = {{ 0x09090909, // TCfg - 0x04040404, // ASelt - 0x00110000, 0x00330022, // ABin + 0x00000000, // ASelt + 0x00230001, 0x00670045, // ABin 0x0a0a0a0a, // BSelt 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, - 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniFp16toFp32_left_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00020000, 0x00060004, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16}; gpu_dp_inst_t uniExtactHalf8_2x8 = {{ 0x11111111, // TCfg @@ -634,7 +691,17 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniFp16toFp32_part1_4x4 = {{ + gpu_dp_inst_t uniFp16toFp32_Lo_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniFp16toFp32_Hi_4x4 = {{ 0x09090909, // TCfg 0x00000000, // ASelt 0x00150004, 0x00370026, // ABin @@ -647,7 +714,8 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) status = vsi_nn_kernel_gpu_add_param( node, "uniConvertI32toI16_2x8", &uniConvertI32toI16_2x8); status |= vsi_nn_kernel_gpu_add_param( node, "uniGetMaskShift_2x8", &uniGetMaskShift_2x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniFp16toFp32_part1_4x4", &uniFp16toFp32_part1_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniFp16toFp32_left_4x4", &uniFp16toFp32_Lo_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniFp16toFp32_Hi_4x4); status |= vsi_nn_kernel_gpu_add_param( node, "uniExtactHalf8_2x8", &uniExtactHalf8_2x8); status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth); CHECK_STATUS_FAIL_GOTO(status, final ); @@ -657,19 +725,21 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) else if (F16 == output_dtype) { status = vsi_nn_kernel_gpu_add_param( node, "uniExtactHalf8_2x8", &uniExtactHalf8_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniRightSubLeft_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniFp16toFp32_left_4x4", &uniFp16toFp32_left_4x4); CHECK_STATUS_FAIL_GOTO(status, final ); } else { status = vsi_nn_kernel_gpu_add_param( node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniRightSubLeft_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniFp16toFp32_left_4x4", &uniFp16toFp32_left_4x4); status |= vsi_nn_kernel_gpu_add_param( node, "uint8Scale", &uint8Scale); status |= vsi_nn_kernel_gpu_add_param( node, 
"output_ZP", &uint8ZP_out); CHECK_STATUS_FAIL_GOTO(status, final ); } - status = vsi_nn_kernel_gpu_add_param( node, "uniFp16toFp32_4x4", &uniFp16toFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniRightSubLeft_4x4); - status |= vsi_nn_kernel_gpu_add_param( node, "scale_xy", scale_factor); + status = vsi_nn_kernel_gpu_add_param( node, "scale_xy", scale_factor); CHECK_STATUS_FAIL_GOTO(status, final ); } else if (BF16 == input_dtype && BF16 == output_dtype) diff --git a/src/tim/vx/internal/src/kernel/evis/space2depth_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/space2depth_internal_evis.c new file mode 100644 index 0000000..cc4c5f6 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/space2depth_internal_evis.c @@ -0,0 +1,366 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_U8TOU8 CVIVANTE_NAMESPACE("evis.space2depth_internal_U8toU8") +#define VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_U8TOU8_X2Y1 CVIVANTE_NAMESPACE("evis.space2depth_internal_U8toU8_X2Y1") +#define VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_I8TOI8 CVIVANTE_NAMESPACE("evis.space2depth_internal_I8toI8") +#define VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_I8TOI8_X2Y1 CVIVANTE_NAMESPACE("evis.space2depth_internal_I8toI8_X2Y1") +#define VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_I16TOI16 CVIVANTE_NAMESPACE("evis.space2depth_internal_I16toI16") +#define VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_I16TOI16_X2Y1 CVIVANTE_NAMESPACE("evis.space2depth_internal_I16toI16_X2Y1") +#define VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_F16TOF16 CVIVANTE_NAMESPACE("evis.space2depth_internal_F16toF16") +#define VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_F16TOF16_X2Y1 CVIVANTE_NAMESPACE("evis.space2depth_internal_F16toF16_X2Y1") + +#define KERNEL_SOURCE_1 "space2depth_internal" + +// Add kernel hashtable here +#define HASH_SPACE2DEPTH_INTERNAL_KEY(_input0_type, _output_type, _opt_stride) \ + ((_input0_type << 24) | (_output_type << 16) | (_opt_stride << 8)) + +#define TENSOR_SPACE2DEPTH_INTERNAL_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SPACE2DEPTH_INTERNAL_KEY(IN0_TYPE, OUT_TYPE, 0), \ + VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_##IN0_TYPE##TO##OUT_TYPE, \ + SOURCE }, + +#define TENSOR_SPACE2DEPTH_INTERNAL_OPT_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SPACE2DEPTH_INTERNAL_KEY(IN0_TYPE, OUT_TYPE, 1), \ + VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_##IN0_TYPE##TO##OUT_TYPE##_X2Y1, \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } space2depth_internal_map[] = +{ + TENSOR_SPACE2DEPTH_INTERNAL_KERNELS(U8, U8, KERNEL_SOURCE_1) + TENSOR_SPACE2DEPTH_INTERNAL_KERNELS(I8, I8, KERNEL_SOURCE_1) + TENSOR_SPACE2DEPTH_INTERNAL_KERNELS(I16, I16, KERNEL_SOURCE_1) + TENSOR_SPACE2DEPTH_INTERNAL_KERNELS(F16, F16, KERNEL_SOURCE_1) + TENSOR_SPACE2DEPTH_INTERNAL_OPT_KERNELS(U8, U8, KERNEL_SOURCE_1) + TENSOR_SPACE2DEPTH_INTERNAL_OPT_KERNELS(I8, I8, KERNEL_SOURCE_1) + TENSOR_SPACE2DEPTH_INTERNAL_OPT_KERNELS(I16, I16, KERNEL_SOURCE_1) + TENSOR_SPACE2DEPTH_INTERNAL_OPT_KERNELS(F16, F16, KERNEL_SOURCE_1) +}; + +/* + * Kernel params + */ +static vx_param_description_t _space2depth_internal_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _SPACE2DEPTH_INTERNAL_PARAM_NUM _cnt_of_array( _space2depth_internal_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_space2depth_internal_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + uint32_t input_dims = 0; + vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; + int32_t input_width = 0; + int32_t input_height = 0; + int32_t input_depth = 0; + int32_t stride_x = 0; + int32_t stride_y = 0; + int32_t opt_flg = 0; + + uint32_t pack_key = 0; 
+ + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &stride_x); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &stride_y); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + input_dims = (uint32_t)attr[0]->shape->size; + input_width = attr[0]->shape->data[0]; + input_height = attr[0]->shape->data[1]; + input_depth = input_dims > 2 ? attr[0]->shape->data[2] : 1; + + shaderParam.global_scale[0] = 1; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + if (stride_x == 2 && stride_y == 1) + { + shaderParam.global_scale[0] = 16; + if (attr[0]->dtype == F16 || attr[0]->dtype == I16) + { + shaderParam.global_scale[0] = 8; + } + opt_flg = 1; + } + shaderParam.global_size[0] = gpu_align_p2((input_width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = input_height; + shaderParam.global_size[2] = input_depth; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + +#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE, OPT_FLG ) \ + (IN0_TYPE | (OUT_TYPE << 8) | (OPT_FLG << 16)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype, opt_flg); + + { + gpu_dp_inst_t uniExtractEvenUint8Stride2_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x06040200, 0x0e0c0a08, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000700, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractOddUint8Stride2_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x07050301, 0x0f0d0b09, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000700, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniExtractEvenFp16Stride2_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00020000, 0x00060004, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractOddFp16Stride2_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00030001, 0x00070005, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param(node, "input_depth", &input_depth); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + switch( pack_key ) + { + case _PACK_SELECT_KEY( U8, U8, 0 ): + case _PACK_SELECT_KEY( I8, I8, 0 ): + case _PACK_SELECT_KEY( I16, I16, 0 ): + case _PACK_SELECT_KEY( F16, F16, 0 ): + break; + case _PACK_SELECT_KEY( U8, U8, 1 ): + case _PACK_SELECT_KEY( I8, I8, 1 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + 
"uniExtractEvenUint8Stride2_2x8", &uniExtractEvenUint8Stride2_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtractOddUint8Stride2_2x8", &uniExtractOddUint8Stride2_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( I16, I16, 1 ): + case _PACK_SELECT_KEY( F16, F16, 1 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniExtractEvenFp16Stride2_4x4", &uniExtractEvenFp16Stride2_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtractOddFp16Stride2_4x4", &uniExtractOddFp16Stride2_4x4 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + break; + } + } +#undef _PACK_SELECT_KEY + + CHECK_STATUS_FAIL_GOTO(status, OnError ); + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + + return status; +} + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel, + const vsi_nn_kernel_param_t * params, + int32_t opt_flg + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_SPACE2DEPTH_INTERNAL_KEY( input0_dtype, output_dtype, opt_flg ); + + for( i = 0; i < _cnt_of_array(space2depth_internal_map); i ++ ) + { + if ( space2depth_internal_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(space2depth_internal_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", space2depth_internal_map[i].function_name ); + kernel->info.parameters = _space2depth_internal_kernel_param_def; + kernel->info.numParams = _SPACE2DEPTH_INTERNAL_PARAM_NUM; + kernel->info.initialize = _space2depth_internal_initializer; + + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + space2depth_internal_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + space2depth_internal_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t tmp_params[_SPACE2DEPTH_INTERNAL_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + int32_t block_size_x = vsi_nn_kernel_param_get_int32( params, "block_size_x" ); + int32_t block_size_y = vsi_nn_kernel_param_get_int32( params, "block_size_y" ); + int32_t opt_flg = (block_size_x == 2 && block_size_y == 1) ? 
1 : 0; + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( inputs, outputs, kernel, params, opt_flg ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + vsi_nn_kernel_node_pack_io( tmp_params, _SPACE2DEPTH_INTERNAL_PARAM_NUM, inputs, 1, outputs, 1 ); + tmp_params[2] = vsi_nn_kernel_scalar_create( graph, I32, &block_size_x ); + tmp_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &block_size_y ); + status = vsi_nn_kernel_node_pass_param( node, tmp_params, _SPACE2DEPTH_INTERNAL_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &tmp_params[2] ); + vsi_nn_kernel_scalar_release( &tmp_params[3] ); + { + // Set default border mode. + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U8 = 0; + border.constant_value.U16 = 0; + if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) + { + border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + } + status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); + CHECK_STATUS(status); + } + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( space2depth_internal, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/upsamplescale_evis.c b/src/tim/vx/internal/src/kernel/evis/upsamplescale_evis.c new file mode 100644 index 0000000..5d89b18 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/upsamplescale_evis.c @@ -0,0 +1,422 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +typedef enum +{ + UP_ORG = 0, + UP_K2, +} _internal_upscale_e; + +#define _UPSAMPLESCALE_KERNEL_SOURCE "upsamplescale" +#define _UPSAMPLESCALE_KERNEL_K2_SOURCE "upsamplescale_k2" +#define _UPSAMPLESCALE_KERNEL_NAME CVIVANTE_NAMESPACE("evis.upsamplescale") + +#define STR(a) #a +// Add kernel hashtable here +#define UPSAMPLESCALE_HASH_KEY( IN_DTYPE, OUT_DTYPE, FLAG ) \ + (( IN_DTYPE ) | ( OUT_DTYPE << 8) | ( FLAG << 16)) + +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ + { UPSAMPLESCALE_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_ORG ), \ + CVIVANTE_NAMESPACE("evis.upsamplescale_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + _UPSAMPLESCALE_KERNEL_SOURCE } + +#define PACK_KERNEL_MAP_K2( IN_DTYPE, OUT_DTYPE ) \ + { UPSAMPLESCALE_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_K2 ), \ + CVIVANTE_NAMESPACE("evis.upsamplescale_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_K2"), \ + _UPSAMPLESCALE_KERNEL_K2_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _upsamplescale_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( F16, F16 ), + PACK_KERNEL_MAP( F16, I16 ), + PACK_KERNEL_MAP( F16, I8 ), + PACK_KERNEL_MAP( F16, U8 ), + PACK_KERNEL_MAP( I16, I16 ), + PACK_KERNEL_MAP( I16, F16 ), + PACK_KERNEL_MAP( I8, I8 ), + PACK_KERNEL_MAP( I8, F16 ), + PACK_KERNEL_MAP( U8, U8 ), + PACK_KERNEL_MAP( U8, F16 ), + + PACK_KERNEL_MAP_K2( F16, F16 ), + PACK_KERNEL_MAP_K2( F16, I16 ), + PACK_KERNEL_MAP_K2( F16, I8 ), + PACK_KERNEL_MAP_K2( F16, U8 ), + PACK_KERNEL_MAP_K2( I16, I16 ), + PACK_KERNEL_MAP_K2( I16, F16 ), + PACK_KERNEL_MAP_K2( I8, I8 ), + PACK_KERNEL_MAP_K2( I8, F16 ), + PACK_KERNEL_MAP_K2( U8, U8 ), + PACK_KERNEL_MAP_K2( U8, F16 ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _upsamplescale_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _UPSAMPLESCALE_PARAM_NUM _cnt_of_array( _upsamplescale_kernel_param_def ) +#define SCALAR_STRIDE_VALUE (2) +#define SCALAR_SCALE_VALUE (3) +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_upsamplescale_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ +#define _PACK_UPSCALE_KEY( IN_TYPE, OUT_TYPE, FLAG ) \ + ( IN_TYPE | ( OUT_TYPE << 16) | (FLAG << 24) ) + + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_nn_kernel_tensor_attr_t * input_attr = NULL; + vsi_int_array_t * in_shape = NULL; + vsi_nn_kernel_dtype_e input_dtype = F16; + vsi_nn_kernel_dtype_e output_dtype = F16; + int32_t stride = 0; + float scale = 0; + float scaleIn = 1.0f; + float scaleOut = 1.0f; + int32_t output_ZP = 0; + int32_t input_ZP = 0; + int32_t srcFixPointPos = 0; + int32_t dstFixPointPos = 0; + uint32_t pack_key = 0; + _internal_upscale_e flag = UP_ORG; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + in_shape = input_attr->shape; + 
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_STRIDE_VALUE], &(stride)); + vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_SCALE_VALUE], &(scale)); + input_dtype = input_attr->dtype; + output_dtype = output_attr->dtype; + + if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) + { + srcFixPointPos = input_attr->dfp.fl; + if (srcFixPointPos >=0 ) + scaleIn = 1.0f / (float) ((int64_t)1 << srcFixPointPos); + else + scaleIn = (float) ((int64_t)1 << -srcFixPointPos); + } + else if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant) + { + input_ZP = input_attr->asymm.zero_point; + scaleIn = input_attr->asymm.scale; + } + + if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) + { + dstFixPointPos = output_attr->dfp.fl; + if (dstFixPointPos >=0 ) + scaleOut = 1.0f / (float) ((int64_t)1 << dstFixPointPos); + else + scaleOut = (float) ((int64_t)1 << -dstFixPointPos); + } + else if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant) + { + output_ZP = output_attr->asymm.zero_point; + scaleOut = output_attr->asymm.scale; + } + + if (stride == 2 && scale >= 0) + { + flag = UP_K2; + } + + if ( flag == UP_K2 ) + { + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + else + { + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + + gpu_param.global_size[0] = gpu_align_p2( + (in_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (in_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = in_shape->size > 2 ? in_shape->data[2] : 1; + + pack_key = _PACK_UPSCALE_KEY( input_dtype, output_dtype, flag ); + + switch( pack_key ) + { + case _PACK_UPSCALE_KEY( F16, F16, UP_K2 ): + case _PACK_UPSCALE_KEY( F16, I16, UP_K2 ): + case _PACK_UPSCALE_KEY( F16, I8, UP_K2 ): + case _PACK_UPSCALE_KEY( F16, U8, UP_K2 ): + case _PACK_UPSCALE_KEY( I16, F16, UP_K2 ): + case _PACK_UPSCALE_KEY( I16, I16, UP_K2 ): + case _PACK_UPSCALE_KEY( I8, F16, UP_K2 ): + case _PACK_UPSCALE_KEY( I8, I8, UP_K2 ): + case _PACK_UPSCALE_KEY( U8, F16, UP_K2 ): + case _PACK_UPSCALE_KEY( U8, U8, UP_K2 ): + { + uint16_t multiplier = 0; + int32_t postShift = 0; + uint32_t multAndoutZP[2] = {0}; + gpu_dp_inst_t uniUpSampleScale2X_lo_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x11111010, 0x13131212, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniUpSampleScale2X_hi_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x15151414, 0x17171616, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + gpu_quantize_multiplier_16bit(scaleIn * scale / scaleOut, &multiplier, &postShift); + multAndoutZP[0] = (uint32_t)(multiplier); + multAndoutZP[1] = (uint32_t)((output_ZP << postShift) - input_ZP * multiplier); + + uniUpSampleScale2X_lo_2x8.data[7] |= (postShift & 0x1F); + uniUpSampleScale2X_hi_2x8.data[7] |= (postShift & 0x1F); + + status = vsi_nn_kernel_gpu_add_param( node, "uniUpScale2X_lo_2x8", &uniUpSampleScale2X_lo_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, 
"uniUpScale2X_hi_2x8", &uniUpSampleScale2X_hi_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP", multAndoutZP); + } + break; + case _PACK_UPSCALE_KEY( F16, F16, UP_ORG ): + case _PACK_UPSCALE_KEY( F16, I16, UP_ORG ): + case _PACK_UPSCALE_KEY( F16, I8, UP_ORG ): + case _PACK_UPSCALE_KEY( F16, U8, UP_ORG ): + case _PACK_UPSCALE_KEY( I16, F16, UP_ORG ): + case _PACK_UPSCALE_KEY( I16, I16, UP_ORG ): + case _PACK_UPSCALE_KEY( I8, F16, UP_ORG ): + case _PACK_UPSCALE_KEY( I8, I8, UP_ORG ): + case _PACK_UPSCALE_KEY( U8, F16, UP_ORG ): + case _PACK_UPSCALE_KEY( U8, U8, UP_ORG ): + { + float output_scale = scaleIn * scale / scaleOut; + float tail = output_ZP - input_ZP * output_scale; + gpu_dp_inst_t uniConvertDatatoF32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniConvertDatatoF32_4x4", &uniConvertDatatoF32_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "output_scale", &output_scale); + status |= vsi_nn_kernel_gpu_add_param( node, "tail", &tail); + } + break; + default: + break; + } + +#undef _PACK_UPSCALE_KEY + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (input_attr) + { + vsi_nn_kernel_tensor_attr_release( &input_attr ); + input_attr = NULL; + } + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release( &output_attr ); + output_attr = NULL; + } + + return status; +} /* _upsamplescale_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t stride, + float scale + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _upsamplescale_kernel_map; + vx_param_description_t * param_def = _upsamplescale_kernel_param_def; + size_t param_def_size = _cnt_of_array( _upsamplescale_kernel_param_def ); + vx_kernel_initialize_f initializer = _upsamplescale_initializer; + _internal_upscale_e flag = (stride == 2 && scale >= 0 ) ? 
UP_K2 : UP_ORG; + + uint32_t key = 0; + int i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = UPSAMPLESCALE_HASH_KEY( in_dtype, out_dtype, flag ); + + for( i = 0; i < _cnt_of_array( _upsamplescale_kernel_map ); i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < _cnt_of_array( _upsamplescale_kernel_map ) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_UPSAMPLESCALE_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t stride = vsi_nn_kernel_param_get_int32( params, "stride" ); + float scale = vsi_nn_kernel_param_get_float32( params, "scale" ); + + status = _query_kernel( kernel, inputs, outputs, stride, scale ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _UPSAMPLESCALE_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_STRIDE_VALUE] = vsi_nn_kernel_scalar_create( graph, I32, &stride ); + node_params[SCALAR_SCALE_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &scale ); + + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _UPSAMPLESCALE_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_STRIDE_VALUE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_VALUE] ); + VSI_ASSERT( status == VSI_SUCCESS ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( upsamplescale, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c index f8d23a0..ef35bf7 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c @@ -24,6 +24,7 @@ #include #include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" #include "vsi_nn_error.h" #include "utils/vsi_nn_math.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" @@ -53,7 +54,7 @@ static vsi_bool compute_gpu_divisor int32_t i = 0; for( i = vsi_nn_min( input_value, limit - 1 ); i > 0; i -- ) { - if( ( i % gcd == 0 ) && ( input_value % i == 0 ) ) + if ( ( i % gcd == 0 ) && ( input_value % i == 0 ) ) { *divisor = i; return TRUE; @@ -75,7 +76,7 @@ static size_t element_fill_dim if (size_x == 1) return 0; - if( size_x < GPU_TENSOR_MAX_WIDTH) + if ( size_x < GPU_TENSOR_MAX_WIDTH) { shape_x[rank_x] = size_x; } @@ -85,7 +86,7 @@ static size_t element_fill_dim int32_t remainder = 0; compute_gpu_divisor( size_x, GPU_TENSOR_MAX_WIDTH, 1, &divisor ); remainder = size_x / divisor; - if( remainder > GPU_TENSOR_MAX_WIDTH || rank_x >= max_rank) + if ( remainder > GPU_TENSOR_MAX_WIDTH || rank_x >= max_rank) { // Cannot optimize. shape_x[rank_x] = size_x; @@ -97,7 +98,7 @@ static size_t element_fill_dim * so it should be always 2. */ cost_size = 2; - if( size_x > 1 ) + if ( size_x > 1 ) { shape_x[rank_x] = divisor; shape_x[rank_x + 1] = remainder; @@ -170,25 +171,25 @@ vsi_bool vsi_nn_kernel_optimize_reduce_shape rank_in += element_fill_dim(out_shape_x, rank_in, GPU_TENSOR_MAX_WIDTH, outerSize); rank_out += element_fill_dim(out_shape_output, rank_out, GPU_TENSOR_MAX_WIDTH, outerSize); - if( 0 == rank_in ) + if ( 0 == rank_in ) { out_shape_x[0] = 1; out_shape_x[1] = 1; rank_in = 2; } - else if( 1 == rank_in ) + else if ( 1 == rank_in ) { out_shape_x[1] = 1; rank_in = 2; } - if( 0 == rank_out ) + if ( 0 == rank_out ) { out_shape_output[0] = 1; out_shape_output[1] = 1; rank_out = 2; } - else if( 1 == rank_out ) + else if ( 1 == rank_out ) { out_shape_output[1] = 1; rank_out = 2; @@ -200,6 +201,75 @@ vsi_bool vsi_nn_kernel_optimize_reduce_shape return ret; } /* vsi_nn_kernel_optimize_reduce_shape() */ +vsi_bool vsi_nn_kernel_optimize_tensor_shape + ( + const int32_t* shape_x, const size_t rank_x, + const int32_t *axis, const size_t axis_size, + int32_t* out_shape_x, uint32_t* out_rank_x, + int32_t* out_axis, uint32_t* out_axis_size + ) +{ + vsi_bool ret = TRUE; + size_t i = 0; + size_t rank_in = 0; + size_t dims = 0; + int32_t innerSize = 1; + int32_t outerSize = 1; + int32_t axisSize = 1; + + for (i = 0; i < axis_size; i++) + { + axisSize *= shape_x[axis[i]]; + } + + for (i = 0; i < (size_t)axis[0]; i++) + { + innerSize *= shape_x[i]; + } + + for (i = axis[axis_size - 1] + 1; i < rank_x; i++) + { + outerSize *= shape_x[i]; + } + + rank_in += element_fill_dim(out_shape_x, rank_in, GPU_TENSOR_MAX_WIDTH, innerSize); + dims = element_fill_dim(out_shape_x, rank_in, GPU_TENSOR_MAX_WIDTH, axisSize); + if (dims == 0) + { + out_axis[0] = (int32_t)rank_in; + *out_axis_size = 1; + out_shape_x[rank_in 
++] = 1; + } + else + { + *out_axis_size = (uint32_t)dims; + for (i = 0; i < dims; i++) + { + out_axis[i] = (int32_t)rank_in + (int32_t)i; + } + } + + rank_in += dims; + + rank_in += element_fill_dim(out_shape_x, rank_in, GPU_TENSOR_MAX_WIDTH, outerSize); + + if ( 0 == rank_in ) + { + out_shape_x[0] = 1; + out_shape_x[1] = 1; + rank_in = 2; + } + else if ( 1 == rank_in ) + { + out_shape_x[1] = 1; + rank_in = 2; + } + + *out_rank_x = (uint32_t)rank_in; + + return ret; +} /* vsi_nn_kernel_optimize_reduce_shape() */ + vsi_bool vsi_nn_kernel_optimize_element_shape ( const int32_t* shape_x, const size_t rank_x, @@ -218,13 +288,13 @@ vsi_bool vsi_nn_kernel_optimize_element_shape rank_in += element_fill_dim(out_shape_x, rank_in, GPU_TENSOR_MAX_WIDTH, element_num); - if( 0 == rank_in ) + if ( 0 == rank_in ) { out_shape_x[0] = 1; out_shape_x[1] = 1; rank_in = 2; } - else if( 1 == rank_in ) + else if ( 1 == rank_in ) { out_shape_x[1] = 1; rank_in = 2; @@ -275,13 +345,13 @@ vsi_bool vsi_nn_kernel_optimize_softmax_shape rank_in += element_fill_dim(out_shape_x, rank_in, GPU_TENSOR_MAX_WIDTH, outerSize); - if( 0 == rank_in ) + if ( 0 == rank_in ) { out_shape_x[0] = 1; out_shape_x[1] = 1; rank_in = 2; } - else if( 1 == rank_in ) + else if ( 1 == rank_in ) { out_shape_x[1] = 1; rank_in = 2; @@ -313,7 +383,7 @@ static size_t tile_fill_dim size_t cost_size = 1; VSI_ASSERT( rank <= max_rank ); VSI_ASSERT( size_output >= (int32_t)((int64_t)(0xFFFFFFFF) - 1) ); - if( size_output < GPU_TENSOR_MAX_WIDTH ) + if ( size_output < GPU_TENSOR_MAX_WIDTH ) { shape_x[rank] = size_x; shape_y[rank] = size_y; @@ -325,7 +395,7 @@ static size_t tile_fill_dim int32_t remainder = 0; compute_gpu_divisor( size_output, GPU_TENSOR_MAX_WIDTH, 1, &divisor ); remainder = size_output / divisor; - if( remainder > GPU_TENSOR_MAX_WIDTH || rank >= max_rank ) + if ( remainder > GPU_TENSOR_MAX_WIDTH || rank >= max_rank ) { // Cannot optimize. shape_x[rank] = size_x; @@ -339,7 +409,7 @@ static size_t tile_fill_dim * so it should be always 2. 
*/ cost_size = 2; - if( size_x > 1 ) + if ( size_x > 1 ) { shape_x[rank] = divisor; shape_x[rank + 1] = remainder; @@ -349,7 +419,7 @@ static size_t tile_fill_dim shape_x[rank] = 1; shape_x[rank + 1] = 1; } - if( size_y > 1 ) + if ( size_y > 1 ) { shape_y[rank] = divisor; shape_y[rank + 1] = remainder; @@ -401,20 +471,20 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape sz = shape_output[i]; /* * Skip dim if the size is equal to 1 - * Also skip if( sx == 1 && sy == 1 ) + * Also skip if ( sx == 1 && sy == 1 ) */ - if( shape_output[i] == 1 ) + if ( shape_output[i] == 1 ) { continue; } // Update state state = TILE_STATE_EMPTY; - if( sx == sz ) + if ( sx == sz ) { state = TILE_STATE_NO_AXIS; } - else if( sx != sz ) + else if ( sx != sz ) { state = TILE_STATE_AXIS_X; } @@ -472,16 +542,16 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape break; } #undef _pack_state - if( append_dim ) + if ( append_dim ) { dims += tile_fill_dim( out_shape_x, out_shape_y, out_shape_output, dims, VSI_NN_MAX_DIM_NUM, sx, sy, sz ); } } - if( ret ) + if ( ret ) { /* Append the last dim */ - if( i == rank_output ) + if ( i == rank_output ) { sx = effective_size_x; sy = effective_size_y; @@ -490,7 +560,7 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape dims, VSI_NN_MAX_DIM_NUM, sx, sy, sz ); } /* Avoid 1D shape*/ - if( 1 == dims ) + if ( 1 == dims ) { out_shape_x[1] = 1; out_shape_y[1] = 1; @@ -508,3 +578,39 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape #undef _swap_size return ret; } /* vsi_nn_kernel_optimize_eltwise_shape() */ + +vsi_bool vsi_nn_kernel_optimize_1d_tensor_shape + ( + const int32_t* shape, const uint32_t rank, + int32_t* out_shape, uint32_t* out_rank + ) +{ + memcpy(out_shape, shape, sizeof(int32_t) * rank); + *out_rank = vsi_nn_max(rank, 2); + + out_shape[1] = rank == 1 ? 
1 : out_shape[1]; + + return TRUE; +} + +vsi_bool vsi_nn_kernel_optimize_nchw2xhw_shape + ( + const int32_t* shape, const uint32_t rank, + int32_t* out_shape, uint32_t* out_rank + ) +{ + uint32_t dim_num = 0; + uint32_t i = 0; + + vsi_nn_kernel_optimize_1d_tensor_shape( shape, + rank, out_shape, &dim_num); + + for (i = 3; i < dim_num; i++) + { + out_shape[2] *= out_shape[i]; + } + + *out_rank = vsi_nn_min(dim_num, 3); + + return TRUE; +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/kernel/vx/clip_vx.c b/src/tim/vx/internal/src/kernel/vx/clip_vx.c index 2c74303..3c4ab45 100644 --- a/src/tim/vx/internal/src/kernel/vx/clip_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/clip_vx.c @@ -131,10 +131,8 @@ static vsi_nn_kernel_node_t _setup float index[1024] = {0}; float value[1024] = {0}; - if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 || - inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || - outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || - outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32) + if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || + outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ) { return NULL; } diff --git a/src/tim/vx/internal/src/kernel/vx/convolutional.c b/src/tim/vx/internal/src/kernel/vx/convolutional.c index bb0d060..04e517c 100644 --- a/src/tim/vx/internal/src/kernel/vx/convolutional.c +++ b/src/tim/vx/internal/src/kernel/vx/convolutional.c @@ -255,7 +255,7 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d ) vx_node node = NULL; vx_nn_convolution_params_ext2_t vxparam; vx_tensor temp_tensors[3] = { NULL }; - int i; + int32_t i; vsi_bool need_explicit_padding = FALSE; _build_vx_conv2d_param( @@ -277,8 +277,17 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d ) if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) { - temp_tensors[1] = _expand_tensor_dim( inputs[1]->t, - (int32_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, 0 ); + int32_t new_w_shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + uint32_t new_w_rank = 4; + new_w_shape[0] = 1; + new_w_shape[1] = inputs[1]->attr.size[0]; + new_w_shape[2] = 1; + for (i = 1; i < (int32_t)(inputs[1]->attr.dim_num); i++) + { + new_w_shape[2] *= inputs[1]->attr.size[i]; + } + new_w_shape[3] = 1; + temp_tensors[1] = vxReshapeTensor( inputs[1]->t, new_w_shape, new_w_rank ); CHECK_PTR_FAIL_GOTO( temp_tensors[1], "Expand kernel dim fail.", final ); } else diff --git a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c index c78de9d..0a64be9 100644 --- a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c @@ -165,10 +165,8 @@ static vsi_nn_kernel_node_t _setup float index[1024] = {0}; float value[1024] = {0}; - if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 || - inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || - outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || - outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32) + if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || + outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ) { return NULL; } diff --git a/src/tim/vx/internal/src/kernel/vx/relu_keras_vx.c b/src/tim/vx/internal/src/kernel/vx/relu_keras_vx.c index 14ec73d..9afde85 100644 --- a/src/tim/vx/internal/src/kernel/vx/relu_keras_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/relu_keras_vx.c @@ -135,10 +135,8 @@ static vsi_nn_kernel_node_t _setup float index[1024] = {0}; float value[1024] = {0}; - if ( 
inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 || - inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || - outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || - outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32) + if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || + outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ) { return NULL; } diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/layer_normalization.cl b/src/tim/vx/internal/src/libnnext/ops/cl/layer_normalization.cl new file mode 100644 index 0000000..8bc826b --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/layer_normalization.cl @@ -0,0 +1,143 @@ + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layer_norm_F32toF32( + __read_only image2d_array_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __write_only image2d_array_t output, + float eps, + float input_zp, + float input_scale, + float output_zp, + float output_scale, + float e2InScale, + float scale_inOut, + float sumZpScale, + float zp2ScaleE2, + float sumZpScaleE2, + int width, + int height, + float dim_ratio + ) +{ + int lidx = get_local_id(0); + int4 coord = (int4)(lidx, get_global_id(1), get_global_id(2), 0); + + float4 data, dst; + float2 sumSqr = (float2)(0); + float scale_vari, bias_val; + __local float2 local_sum[16]; + + for(; coord.x < width;) + { + data = read_imagef(input, coord); + coord.x += 16; + sumSqr.x += data.x; + sumSqr.y += data.x * data.x; + } + local_sum[lidx] = sumSqr; + barrier(CLK_LOCAL_MEM_FENCE); + if(lidx == 0) + { + for(int i = 1; i < 16; i++) + { + sumSqr += local_sum[i]; + } + local_sum[0] = sumSqr; + } + barrier(CLK_LOCAL_MEM_FENCE); + sumSqr = local_sum[0] * dim_ratio; + sumSqr.s1 = sumSqr.s1 - sumSqr.s0 * sumSqr.s0 + eps; + sumSqr.s1 = rsqrt(sumSqr.s1); + + for(coord.x = lidx; coord.x < width;) + { + float4 gamma = read_imagef(scale, coord.xw); + float4 beta = read_imagef(bias, coord.xw); + data = read_imagef(input, coord); + + scale_vari = gamma.s0 * sumSqr.s1; + bias_val = (beta.s0 - scale_vari * sumSqr.s0); + + dst.x = data.x * scale_vari + bias_val; + write_imagef(output, coord, dst); + coord.x += 16; + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layer_norm_U8toU8( + __read_only image2d_array_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __write_only image2d_array_t output, + float eps, + float input_zp, + float input_scale, + float output_zp, + float output_scale, + float e2InScale, + float scale_inOut, + float sumZpScale, + float zp2ScaleE2, + float sumZpScaleE2, + int width, + int height, + float dim_ratio + ) +{ + int lidx = get_local_id(0); + int4 coord = (int4)(lidx, get_global_id(1), get_global_id(2), 0); + + uint4 data, dst; + float2 sumSqr; + uint tmpSum = 0, tmpSqr = 0; + float scale_vari, bias_val; + __local uint local_sum[1]; + __local uint local_sqr[1]; + + if(lidx == 0) + { + local_sum[0] = 0; + local_sqr[0] = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + for(; coord.x < width;) + { + data = read_imageui(input, coord); + coord.x+=16; + tmpSum += data.x; + tmpSqr += data.x * data.x; + } + atom_add(local_sum, tmpSum); + atom_add(local_sqr, tmpSqr); + barrier(CLK_LOCAL_MEM_FENCE); + tmpSum = local_sum[0]; + tmpSqr = local_sqr[0]; + //sumSqr.x = ((float)tmpSum - width * input_zp) * input_scale; + //sumSqr.y = ((float)tmpSqr - 2 * input_zp * (float)tmpSum + width * input_zp * input_zp) * e2InScale; + sumSqr.x = (float)tmpSum * input_scale - sumZpScale; + sumSqr.y = (float)tmpSqr * e2InScale - zp2ScaleE2 * 
(float)tmpSum + sumZpScaleE2; + + sumSqr *= dim_ratio; + sumSqr.s1 = sumSqr.s1 - sumSqr.s0 * sumSqr.s0 + eps; + sumSqr.s1 = rsqrt(sumSqr.s1); + + for(coord.x = lidx; coord.x < width;) + { + float4 gamma = read_imagef(scale, coord.xw); + float4 beta = read_imagef(bias, coord.xw); + data = read_imageui(input, coord); + + scale_vari = gamma.s0 * sumSqr.s1; + float alpha = scale_inOut * scale_vari; + bias_val = (beta.s0 - scale_vari * sumSqr.s0) * output_scale + output_zp; + + float tmpVal = data.x - input_zp; + + float4 norm; + norm.x = tmpVal * alpha + bias_val; + dst = convert_uint4_rte(norm); + write_imageui(output, coord, dst); + coord.x+=16; + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul.cl b/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul.cl index ec757ca..70a81da 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul.cl @@ -6,32 +6,30 @@ __kernel void gemm_F32F32toF32_2D( int K, int N, int ac2zero, - int bc2zero + int bc2zero, + float scale_a, + float zp_a, + float scale_b, + float zp_b, + float scale_out, + float zp_out ) { - int gidx = get_global_id(0); - int gidy = get_global_id(1); - - int2 coord_a = (int2)(0, gidy); - int2 coord_b = (int2)(gidx, 0); - + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); float4 sum = (float4)(0); - for(; coord_a.x < K;) + for(; coord.z < K;) { float4 tempA0; float4 tempB0; - tempA0 = read_imagef(inputA, coord_a); - tempB0 = read_imagef(inputB, coord_b); - coord_a.x++; - coord_b.y++; + tempA0 = read_imagef(inputA, coord.zy); + tempB0 = read_imagef(inputB, coord.xz); + coord.z++; - sum += tempA0 * tempB0; + sum = sum + tempA0 * tempB0; } - - coord_b.y = gidy; - write_imagef(output, coord_b, sum); + write_imagef(output, coord.xy, sum); } __kernel void gemm_F32F32toF32_3D( @@ -42,7 +40,13 @@ __kernel void gemm_F32F32toF32_3D( int K, int N, int ac2zero, - int bc2zero + int bc2zero, + float scale_a, + float zp_a, + float scale_b, + float zp_b, + float scale_out, + float zp_out ) { int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? 0 : get_global_id(2)), 0); @@ -60,10 +64,160 @@ __kernel void gemm_F32F32toF32_3D( coord_a.x++; coord_b.y++; - sum += tempA0 * tempB0; + sum = sum + tempA0 * tempB0; } coord_b.y = get_global_id(1); coord_b.z = get_global_id(2); write_imagef(output, coord_b, sum); } + +__kernel void gemm_transb_F32F32toF32_2D( + __read_only image2d_t inputA, + __read_only image2d_t inputB, + __write_only image2d_t output, + int M, + int K, + int N, + int ac2zero, + int bc2zero, + float scale_a, + float zp_a, + float scale_b, + float zp_b, + float scale_out, + float zp_out + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + float4 sum = (float4)(0); + + for(; coord.z < K;) + { + float4 tempA0; + float4 tempB0; + + tempA0 = read_imagef(inputA, coord.zy); + tempB0 = read_imagef(inputB, coord.zx); + coord.z++; + + sum = sum + tempA0 * tempB0; + } + write_imagef(output, coord.xy, sum); +} + +__kernel void gemm_transb_F32F32toF32_3D( + __read_only image2d_array_t inputA, + __read_only image2d_array_t inputB, + __write_only image2d_array_t output, + int M, + int K, + int N, + int ac2zero, + int bc2zero, + float scale_a, + float zp_a, + float scale_b, + float zp_b, + float scale_out, + float zp_out + ) +{ + int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? 0 : get_global_id(2)), 0); + int4 coord_b = (int4)(0, get_global_id(0), (bc2zero ? 
0 : get_global_id(2)), 0); + + float4 sum = (float4)(0); + + for(; coord_a.x < K;) + { + float4 tempA0; + float4 tempB0; + + tempA0 = read_imagef(inputA, coord_a); + tempB0 = read_imagef(inputB, coord_b); + coord_a.x++; + coord_b.x++; + + sum = sum + tempA0 * tempB0; + } + + coord_a.x = get_global_id(0); + coord_a.z = get_global_id(2); + write_imagef(output, coord_b, sum); +} + +__kernel void gemm_transb_F32I8toF32_2D( + __read_only image2d_t inputA, + __read_only image2d_t inputB, + __write_only image2d_t output, + int M, + int K, + int N, + int ac2zero, + int bc2zero, + float scale_a, + float zp_a, + float scale_b, + float zp_b, + float scale_out, + float zp_out + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + float4 sum = (float4)(0); + for(; coord.z < K;) + { + float4 tempA0; + float4 tempB0; + + tempA0 = read_imagef(inputA, coord.zy); + tempB0 = convert_float4(read_imagei(inputB, coord.zx)); + coord.z++; + tempB0.x = (tempB0.x - zp_b) * scale_b; + + sum = sum + tempA0 * tempB0; + } + + write_imagef(output, coord.xy, sum); +} + +__kernel void gemm_transb_F32I8toF32_3D( + __read_only image2d_array_t inputA, + __read_only image2d_array_t inputB, + __write_only image2d_array_t output, + int M, + int K, + int N, + int ac2zero, + int bc2zero, + float scale_a, + float zp_a, + float scale_b, + float zp_b, + float scale_out, + float zp_out + ) +{ + int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? 0 : get_global_id(2)), 0); + int4 coord_b = (int4)(0, get_global_id(0), (bc2zero ? 0 : get_global_id(2)), 0); + + float4 sum = (float4)(0); + + for(; coord_a.x < K;) + { + float4 tempA0; + float4 tempB0; + + tempA0 = read_imagef(inputA, coord_a); + tempB0 = convert_float4(read_imagei(inputB, coord_b)); + tempB0.x = (tempB0.x - zp_b) * scale_b; + coord_a.x++; + coord_b.x++; + + sum = sum + tempA0 * tempB0; + } + + coord_a.x = get_global_id(0); + coord_a.z = get_global_id(2); + write_imagef(output, coord_b, sum); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_transA.cl b/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_transA.cl index 7c290d4..b7bc8ee 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_transA.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_transA.cl @@ -6,32 +6,30 @@ __kernel void gemm_transa_F32F32toF32_2D( int K, int N, int ac2zero, - int bc2zero + int bc2zero, + float scale_a, + float zp_a, + float scale_b, + float zp_b, + float scale_out, + float zp_out ) { - int gidx = get_global_id(0); - int gidy = get_global_id(1); - - int2 coord_a = (int2)(gidy, 0); - int2 coord_b = (int2)(gidx, 0); - + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); float4 sum = (float4)(0); - for(; coord_a.y < K;) + for(; coord.z < K;) { float4 tempA0; float4 tempB0; - tempA0 = read_imagef(inputA, coord_a); - tempB0 = read_imagef(inputB, coord_b); - coord_a.y++; - coord_b.y++; + tempA0 = read_imagef(inputA, coord.yz); + tempB0 = read_imagef(inputB, coord.xz); + coord.z++; - sum += tempA0 * tempB0; + sum = sum + tempA0 * tempB0; } - - coord_b.y = gidy; - write_imagef(output, coord_b, sum); + write_imagef(output, coord.xy, sum); } __kernel void gemm_transa_F32F32toF32_3D( @@ -42,7 +40,13 @@ __kernel void gemm_transa_F32F32toF32_3D( int K, int N, int ac2zero, - int bc2zero + int bc2zero, + float scale_a, + float zp_a, + float scale_b, + float zp_b, + float scale_out, + float zp_out ) { int gidx = get_global_id(0); @@ -63,7 +67,7 @@ __kernel void gemm_transa_F32F32toF32_3D( coord_a.y++; coord_b.y++; - sum += tempA0 * tempB0; + 
sum = sum + tempA0 * tempB0; } coord_b.y = gidy; diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/roi_align.cl b/src/tim/vx/internal/src/libnnext/ops/cl/roi_align.cl new file mode 100644 index 0000000..feef55a --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/roi_align.cl @@ -0,0 +1,108 @@ +inline float roi_align_1x1 +( + __read_only image2d_array_t input, + float2 region_start, + float2 region_end, + float2 bin_size, + int2 grid_size, + float2 rcp_of_grid_size, + int pz +) +{ + float sum = 0; + + for(int iy = 0; iy < grid_size.y; ++iy) + { + for(int ix = 0; ix < grid_size.x; ++ix) + { + float2 ixy = (float2)(ix + 0.5f, iy + 0.5f); + float2 pos = region_start + ixy * bin_size * rcp_of_grid_size; + + int2 xy_low = convert_int2(pos); + int2 xy_high = xy_low + 1; + + float ly = pos.y - xy_low.y; + float lx = pos.x - xy_low.x; + float hy = 1.0f - ly; + float hx = 1.0f - lx; + + float w1 = hy * hx; + float w2 = hy * lx; + float w3 = ly * hx; + float w4 = ly * lx; + + float data1 = read_imagef(input, (int4)(xy_low.x, xy_low.y, pz, 0)).x; + float data2 = read_imagef(input, (int4)(xy_high.x, xy_low.y, pz, 0)).x; + float data3 = read_imagef(input, (int4)(xy_low.x, xy_high.y, pz, 0)).x; + float data4 = read_imagef(input, (int4)(xy_high.x, xy_high.y, pz, 0)).x; + + sum = sum + w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; + } + } + + return (float)(sum * rcp_of_grid_size.x * rcp_of_grid_size.y); +} + + +#define EPS_GRID 0.00001f +__kernel void roi_align_F32toF32 +( + __read_only image2d_array_t input, + __read_only image2d_t rois, + __read_only image2d_t n_rois, + __write_only image2d_array_t output, + float spatial_x_scale, + float spatial_y_scale, + float in_width, + float in_height, + float rcp_of_out_width, + float rcp_of_out_height, + float sampling_x_ratio, + float sampling_y_ratio, + int depth +) +{ + int px = get_global_id(0); + int py = get_global_id(1); + int pw = get_global_id(2); + + int roi_batch = read_imagei(n_rois, (int2)(pw, 0)).x; + float4 roi_x = read_imagef(rois, (int2)(0, pw)); + float4 roi_y = read_imagef(rois, (int2)(1, pw)); + float4 roi_z = read_imagef(rois, (int2)(2, pw)); + float4 roi_w = read_imagef(rois, (int2)(3, pw)); + float4 roi = (float4)(roi_x.x, roi_y.x, roi_z.x, roi_w.x); + + float4 roi_anchor = roi * (float4)(spatial_x_scale, spatial_y_scale, spatial_x_scale, spatial_y_scale); + float2 roi_dims = fmax(roi_anchor.zw - roi_anchor.xy, 1.0f); + + float2 spatial_indx = (float2)(px, py); + float2 pooled_dims = (float2)(rcp_of_out_width, rcp_of_out_height); + float2 max_spatial_dims = (float2)(in_width, in_height); + + float2 bin_size = roi_dims * pooled_dims; + float2 region_start = spatial_indx * bin_size + roi_anchor.xy; + float2 region_end = region_start + bin_size; + + float2 roi_bin_grid = (float2)(sampling_x_ratio, sampling_y_ratio); + + roi_bin_grid = roi_bin_grid == 0 ? 
ceil(bin_size - EPS_GRID) : roi_bin_grid; + + int kz = roi_batch * depth; + float2 rcp_of_grid_size = 1.0f / roi_bin_grid; + int2 grid_size_xy = convert_int2(roi_bin_grid); + float4 interp; + int kz1 = pw * depth; + for (int pz = 0; pz < depth; pz ++, kz ++, kz1 ++) + { + interp.x = roi_align_1x1( input, + region_start, + region_end, + bin_size, + grid_size_xy, + rcp_of_grid_size, + kz); + + write_imagef(output, (int4)(px, py, kz1, 0), interp); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/space2depth_internal.cl b/src/tim/vx/internal/src/libnnext/ops/cl/space2depth_internal.cl new file mode 100644 index 0000000..fc39817 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/space2depth_internal.cl @@ -0,0 +1,90 @@ + +__kernel void space2depth_internal_F32toF32 ( + image2d_array_t input, + image2d_array_t output, + int block_size_x, int block_size_y, + float scaleInOut, float zpInOut) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int z = get_global_id(2); + int inDepth = get_image_array_size(input); + + int4 coord = (int4)(x, y, z, 0); + float4 data = {0.0}; + data = read_imagef(input, coord); + + ushort blockSize_x = convert_ushort(block_size_x); + ushort blockSize_y = convert_ushort(block_size_y); + int4 coord_out = (int4)(convert_ushort(x)/blockSize_x, convert_ushort(y)/blockSize_y, 0, 0); + coord_out.z = ((x - coord_out.x * block_size_x) + (y - coord_out.y * block_size_y) * block_size_x) * inDepth + + z; + write_imagef(output, coord_out, data); +} + +__kernel void space2depth_internal_F32toF32_X2Y1 ( + image2d_array_t input, + image2d_array_t output, + int block_size_x, int block_size_y, + float scaleInOut, float zpInOut) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int z = get_global_id(2); + int inDepth = get_image_array_size(input); + + int4 coord = (int4)(x, y, z, 0); + float4 data = {0.0}; + data = read_imagef(input, coord); + + int4 coord_out = (int4)(x >> 1, y, 0, 0); + coord_out.z = (x & 1) * inDepth + z; + write_imagef(output, coord_out, data); +} + +__kernel void space2depth_internal_U8toU8 ( + image2d_array_t input, + image2d_array_t output, + int block_size_x, int block_size_y, + float scaleInOut, float zpInOut) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int z = get_global_id(2); + int inDepth = get_image_array_size(input); + + int4 coord = (int4)(x, y, z, 0); + uint4 data = {0}; + data = read_imageui(input, coord); + + ushort blockSize_x = convert_ushort(block_size_x); + ushort blockSize_y = convert_ushort(block_size_y); + int4 coord_out = (int4)(convert_ushort(x)/blockSize_x, convert_ushort(y)/blockSize_y, 0, 0); + coord_out.z = ((x - coord_out.x * block_size_x) + (y - coord_out.y * block_size_y) * block_size_x) * inDepth + + z; + + data.x = convert_uint(data.x * scaleInOut + zpInOut); + write_imageui(output, coord_out, data); +} + +__kernel void space2depth_internal_U8toU8_X2Y1 ( + image2d_array_t input, + image2d_array_t output, + int block_size_x, int block_size_y, + float scaleInOut, float zpInOut) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int z = get_global_id(2); + int inDepth = get_image_array_size(input); + + int4 coord = (int4)(x, y, z, 0); + uint4 data = {0}; + data = read_imageui(input, coord); + + int4 coord_out = (int4)(x >> 1, y, 0, 0); + coord_out.z = (x & 1) * inDepth + z; + + data.x = convert_uint(data.x * scaleInOut + zpInOut); + write_imageui(output, coord_out, data); +} diff --git 
a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_crop.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_crop.c deleted file mode 100644 index 3ab7764..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_crop.c +++ /dev/null @@ -1,253 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - -#include -#include -#include -#include - -#include "vsi_nn_platform.h" - -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -void myTensorCropFunc - ( - int8_t *src, - int8_t *dst - ) -{ - - return; -} -vsi_status VX_CALLBACK TensorCropInternalKernel - (vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ - vsi_status status = VX_ERROR_INVALID_PARAMETERS; - - if(paramNum == 2) - { - - } - - return status; -} - -vsi_status VX_CALLBACK TensorCropInitializer - (vx_node nodObj, - const vx_reference *paramObj, - uint32_t paraNum - ) -{ - vsi_status status = VX_SUCCESS; -#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) - vx_kernel_execution_parameters_t shaderParam = { - 3, // workdim - {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image - {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread - {0, 0, 0}, // localWorkSize: local group size in threads - {0, 0, 0}}; // globalWorkSize: image size in threads - - vx_tensor input = (vx_tensor)paramObj[0]; - vx_tensor output = (vx_tensor)paramObj[1]; - uint32_t output_size[4] = {1, 1, 1, 1}; - vsi_enum dataFormat, dstFormat; - int8_t input_fixPointPos = 0; - vx_uint32 i = 0; - int32_t offset[3]; - size_t size[DIM_SIZE]; - vsi_nn_tensor_attr_t attr[2]; - - memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t)); - memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t)); - - status = vsi_nn_vxGetTensorAttr(input, &attr[0]); - status |= vsi_nn_vxGetTensorAttr(output, &attr[1]); - if (status != VX_SUCCESS) - { - VSILOGE("vsi_nn_vxGetTensorAttr failure! 
at line %d\n", __LINE__); - return status; - } - - dataFormat = attr[0].dtype.vx_type; - input_fixPointPos = attr[0].dtype.fl; - dstFormat = attr[1].dtype.vx_type; - for (i = 0; i < attr[1].dim_num; i++) - { - output_size[i] = attr[1].size[i]; - } - - vxCopyScalar((vx_scalar)paramObj[2], &offset[0], VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[3], &offset[1], VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[4], &offset[2], VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - - memset(size, 0, sizeof(size_t) * DIM_SIZE); - switch(dstFormat) - { - case VSI_NN_TYPE_INT8: - case VSI_NN_TYPE_UINT8: - size[0] = 16; - size[1] = 4; - break; - case VSI_NN_TYPE_INT16: - case VSI_NN_TYPE_UINT16: - case VSI_NN_TYPE_FLOAT16: - size[0] = 8; - size[1] = 4; - break; - } - - shaderParam.globalWorkOffset[0] = offset[0]; - shaderParam.globalWorkOffset[1] = offset[1]; - shaderParam.globalWorkOffset[2] = offset[2]; - shaderParam.globalWorkScale[0] = size[0]; - shaderParam.globalWorkScale[1] = size[1]; - shaderParam.globalWorkScale[2] = 1; - shaderParam.globalWorkSize[0] = gcmALIGN((output_size[0] + shaderParam.globalWorkScale[0] - 1) - / shaderParam.globalWorkScale[0], 4); - shaderParam.globalWorkSize[1] = (output_size[1] + shaderParam.globalWorkScale[1] - 1) - / shaderParam.globalWorkScale[1]; - shaderParam.globalWorkSize[2] = output_size[2]; - - if(dataFormat == VSI_NN_TYPE_INT16 && dstFormat == VSI_NN_TYPE_FLOAT16) - { - vx_uint32 uniConvertInt16toFp16_2x8[16] = { - 0x11111111, // TCfg - 0x00000000, // ASelt - 0x03020100, 0x07060504, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }; - -#define cropMIN(x, y) (((x) <= (y)) ? (x) : (y)) -#define CROP_MAX_POST_SHIFT_BITS (31) -#define CROP_MAX_MULTIPLIER_NUM (65535) - - if (input_fixPointPos > 0) - { - vx_uint8 postshift = cropMIN(input_fixPointPos, CROP_MAX_POST_SHIFT_BITS); - - uniConvertInt16toFp16_2x8[7] |= (postshift & 0x1F); - } - else - { - vx_uint32 multiplier = cropMIN((int64_t)1 << (-input_fixPointPos), CROP_MAX_MULTIPLIER_NUM); - - for (i = 0; i < 8; i++) - { - uniConvertInt16toFp16_2x8[i + 8] = multiplier; - } - } -#undef cropMIN -#undef CROP_MAX_POST_SHIFT_BITS -#undef CROP_MAX_MULTIPLIER_NUM - - status |= vxSetNodeUniform(nodObj, "uniConvertInt16toFp16_2x8", 1, uniConvertInt16toFp16_2x8); - } - - vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, - &shaderParam, sizeof(vx_kernel_execution_parameters_t)); - - if(status < 0) - { - VSILOGE("[%s : %d]Initializer failure! 
\n",__FILE__, __LINE__); - } - return status; -} - -vx_param_description_t basekernel_tensorCrop_params[] = { - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED} -}; - - -#ifdef __cplusplus -extern "C" { -#endif -vx_kernel_description_t vxTensorCropKernelInt16Info = -{ - VX_KERNEL_ENUM_TENSORCROP_INT16, - VX_KERNEL_NAME_TENSORCROP_INT16, - NULL, - basekernel_tensorCrop_params, - (sizeof(basekernel_tensorCrop_params) / sizeof(basekernel_tensorCrop_params[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - TensorCropInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxTensorCropKernelInt8Info = -{ - VX_KERNEL_ENUM_TENSORCROP_INT8, - VX_KERNEL_NAME_TENSORCROP_INT8, - NULL, - basekernel_tensorCrop_params, - (sizeof(basekernel_tensorCrop_params) / sizeof(basekernel_tensorCrop_params[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - TensorCropInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxTensorCropKernelInt16Fp16Info = -{ - VX_KERNEL_ENUM_TENSORCROP_INT16_FP16, - VX_KERNEL_NAME_TENSORCROP_INT16_FP16, - NULL, - basekernel_tensorCrop_params, - (sizeof(basekernel_tensorCrop_params) / sizeof(basekernel_tensorCrop_params[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - TensorCropInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t * vx_kernel_CROP_list[] = -{ - NULL, - &vxTensorCropKernelInt16Info, - &vxTensorCropKernelInt8Info, - &vxTensorCropKernelInt16Fp16Info, - NULL -}; -#ifdef __cplusplus -} -#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_fullconnect2.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_fullconnect2.c deleted file mode 100644 index dacde22..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_fullconnect2.c +++ /dev/null @@ -1,323 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ - -#include -#include -#include -#include - -#include "vsi_nn_platform.h" - -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -#define _VX_KERNEL_VAR (vx_kernel_FCL2) -#define _VX_KERNEL_ID (VX_KERNEL_ENUM_FULLYCONNECTED_AXIS2) -#define _VX_KERNEL_NAME ("vsi_nn_kernel_fullconnect2") -#define _VX_KERNEL_FUNC_KERNEL (vxFullconnect2Kernel) - -//static uint32_t layerNum = 0; - -static vsi_status VX_CALLBACK vxFullconnect2Kernel - ( - vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ - /* TODO: */ -#define ARG_NUM (2) -#define TENSOR_NUM_INPUT (3) -#define TENSOR_NUM_OUTPUT (1) -#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT) - - vsi_status status = VX_SUCCESS; - uint32_t i, j, k; - vx_context context = NULL; - vsi_nn_tensor_attr_t attr[TENSOR_NUM]; - uint32_t stride_size[TENSOR_NUM][VSI_NN_MAX_DIM_NUM]; - vx_tensor_addressing user_addr[TENSOR_NUM] = {NULL}; - uint8_t *buffer_ptr[TENSOR_NUM] = {NULL}; - vx_tensor tensor[TENSOR_NUM]; - - //char fileName[256] = {'\0'}; - //uint32_t total_size; - int32_t axis, weights; - uint32_t num_fc = 1, num_no_fc = 1; - - - //prepare data - context = vxGetContext((vx_reference)node); - - for( i = 0; i < TENSOR_NUM_INPUT; i ++ ) - { - tensor[i] = (vx_tensor)paramObj[i]; - buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i], - &(attr[i]), stride_size[i], &(user_addr[i]), VX_READ_ONLY); - } - for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ ) - { - tensor[i] = (vx_tensor)paramObj[i]; - buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i], - &(attr[i]), stride_size[i], &(user_addr[i]), VX_WRITE_ONLY); - } - - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(axis), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 1], &(weights), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - - //op calc - for(i = 0; i <= (uint32_t)axis; ++i) - { - num_fc *= attr[0].size[i]; - } - for(i = axis + 1; i < attr[0].dim_num; ++i) - { - num_no_fc *= attr[0].size[i]; - } - - for(k = 0; k < num_no_fc; ++k) - { - for(j = 0; j < (uint32_t)weights; ++j) - { - float sum; - vsi_nn_DtypeToFloat32(&buffer_ptr[2][stride_size[2][0] * j], &sum, &attr[2].dtype); - for(i = 0; i < num_fc; ++i) - { - float x, w; - vsi_nn_DtypeToFloat32(&buffer_ptr[0][stride_size[0][0] * (i + num_fc * k)], - &x, &attr[0].dtype); - vsi_nn_DtypeToFloat32(&buffer_ptr[1][stride_size[1][0] * (i + num_fc * j)], - &w, &attr[1].dtype); - sum += w * x; - } - vsi_nn_Float32ToDtype(sum, &buffer_ptr[3][stride_size[3][0] * (j + weights * k)], - &attr[3].dtype); - } - } - -#if 0 - print_index = 3; - total_size = vsi_nn_ShapeProduct(size[print_index], dim_num[print_index]); - if (dim_num[print_index] == 3) - { - snprintf(fileName, VSI_NN_MAX_PATH, "%s_%d_%d_%d_%d.txt", _VX_KERNEL_NAME, layerNum, - size[print_index][0], size[print_index][1], size[print_index][2]); - } - else - { - snprintf(fileName, VSI_NN_MAX_PATH, "%s_%d_%d_%d_%d_%d.txt", _VX_KERNEL_NAME, layerNum, - size[print_index][0], size[print_index][1], size[print_index][2], size[print_index][3]); - } - vsi_nn_SaveDataToText(fileName, buffer_ptr[print_index], total_size, - data_format[print_index], NULL); - layerNum++; -#endif - //save data - for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ ) - { - status = 
vsi_nn_copy_tensor_patch(tensor[i], &attr[i], buffer_ptr[i], VX_WRITE_ONLY); - if (user_addr[i]) vxReleaseTensorAddressing(&(user_addr[i])); - } - for( i = 0; i < TENSOR_NUM; i ++ ) - { - if (buffer_ptr[i]) free(buffer_ptr[i]); - } - - return status; -} /* _VX_KERNEL_FUNC_KERNEL() */ - - -static vx_param_description_t s_params[] = -{ - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, -}; - -void myFullyConnected_Axis2Func - ( - int8_t *src, - int8_t *dst - ) -{ - - return; -} -vsi_status VX_CALLBACK vxFullyConnected_Axis2Kernel - ( - vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ - vsi_status status = VX_ERROR_INVALID_PARAMETERS; - - if(paramNum == 2) - { - - } - - return status; -} - -vsi_status VX_CALLBACK vxFullyConnected_Axis2Initializer - ( - vx_node nodObj, - const vx_reference *paramObj, - uint32_t paraNum - ) -{ - vsi_status status = VX_SUCCESS; - - // Alignment with a power of two value. -#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) - vx_kernel_execution_parameters_t shaderParam = { - 2, // workdim - {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image - {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread - {0, 0, 0}, // localWorkSize: local group size in threads - {0, 0, 0}}; // globalWorkSize: image size in threads - - uint32_t input_size[DIM_SIZE] = {1, 1, 1, 1}; - uint32_t output_size[DIM_SIZE] = {1, 1, 1, 1}; - - uint32_t uniMulAcc_16x1[16] = { - 0x00005555, // TCfg - 0x00000000, // ASelt - 0x76543210, 0x00000000, // ABin - 0x00005555, // BSelt - 0x76543210, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }; - uint32_t loopNum = 0; - vsi_nn_tensor_attr_t attr[2]; - uint32_t i; - uint32_t input_dims = 0; - uint32_t output_dims = 0; - - memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t)); - memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t)); - - status = vsi_nn_vxGetTensorAttr((vx_tensor)paramObj[1], &attr[0]); - status |= vsi_nn_vxGetTensorAttr((vx_tensor)paramObj[3], &attr[1]); - if (status != VX_SUCCESS) - { - VSILOGE("vsi_nn_vxGetTensorAttr failure! 
at line %d\n", __LINE__); - return status; - } - input_dims = attr[0].dim_num; - for (i = 0; i < input_dims; i++) - { - input_size[i] = attr[0].size[i]; - } - output_dims = attr[1].dim_num; - for (i = 0; i < output_dims; i++) - { - output_size[i] = attr[1].size[i]; - } - - shaderParam.globalWorkOffset[0] = 0; - shaderParam.globalWorkOffset[1] = 0; - shaderParam.globalWorkScale[0] = 1; - shaderParam.globalWorkScale[1] = 1; - shaderParam.globalWorkSize[0] = gcmALIGN((output_size[0] + shaderParam.globalWorkScale[0] - 1) - / shaderParam.globalWorkScale[0], 4); - shaderParam.globalWorkSize[1] = (output_size[1] + shaderParam.globalWorkScale[1] - 1) - / shaderParam.globalWorkScale[1]; - - vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, - &shaderParam, sizeof(vx_kernel_execution_parameters_t)); - - vxSetNodeUniform(nodObj, "uniMulAcc_16x1", 1, uniMulAcc_16x1); - - loopNum = gcmALIGN(input_size[0], 32); - vxSetNodeUniform(nodObj, "loopNum", 1, &loopNum); - if(status < 0) - { - VSILOGE("[%s : %d]Initializer failure! \n",__FILE__, __LINE__); - } - return status; -} - -static vx_param_description_t vxFullyConnected_Axis2KernelParam[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED} -}; - -#ifdef __cplusplus -extern "C" { -#endif -vx_kernel_description_t _VX_KERNEL_VAR = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - _VX_KERNEL_FUNC_KERNEL, - s_params, - _cnt_of_array( s_params ), - vsi_nn_KernelValidator, - NULL, - NULL, - vsi_nn_KernelInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxFullyConnected_Axis2KernelInfo = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - vxFullyConnected_Axis2Kernel, - vxFullyConnected_Axis2KernelParam, - (sizeof(vxFullyConnected_Axis2KernelParam) / sizeof(vxFullyConnected_Axis2KernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxFullyConnected_Axis2Initializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t * vx_kernel_FCL2_list[] = -{ - &_VX_KERNEL_VAR, - &vxFullyConnected_Axis2KernelInfo, - NULL -}; -#ifdef __cplusplus -} -#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_layernormalize.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_layernormalize.c deleted file mode 100644 index f259835..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_layernormalize.c +++ /dev/null @@ -1,688 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - -#include -#include -#include -#include -#include - -#include "vsi_nn_platform.h" - -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -void myLayerNormFunc - ( - void* src, - int16_t* scale, - float* bias, - float eps, - void* dst, - uint32_t input_dim, - uint32_t width, - uint32_t height, - uint32_t channel, - uint32_t batch - ) -{ - uint32_t ch = (input_dim <= 2) ? 1 : channel; - uint32_t bn = (input_dim <= 3) ? 1 : batch; - uint32_t b = 0, c = 0, h = 0, w = 0; - - int16_t* imgIn, *imgOut; - imgIn = (int16_t*)src; - imgOut = (int16_t*)dst; - - VSILOGI("Hello myLayerNormFunc!\n"); - for (b = 0; b < bn; b++) - { - for (c = 0; c < ch; c++) - { - for (h = 0; h < height; h++) - { - uint32_t len = (h + (c + b*ch)*height) * width; - float sum = .0f; - float sumsq = .0f; - float mean = .0f; - float vari = .0f; - - for (w = 0; w < width; w++) - { - uint32_t index = len + w; - sum += vsi_nn_Fp16toFp32(imgIn[index]); - } - mean = sum / width; - for (w = 0; w < width; w++) - { - uint32_t index = len + w; - float data = vsi_nn_Fp16toFp32(imgIn[index]) - mean; - sumsq += data * data; - } - vari = sumsq / width; - vari = (float)(1.0 / sqrtf(vari + eps)); - for (w = 0; w < width; w++) - { - uint32_t index = len + w; - float data = vsi_nn_Fp16toFp32(imgIn[index]) - mean; - float scaleVal = vsi_nn_Fp16toFp32(scale[w]); - float biasVal = bias[w]; - float normVal = data * vari * scaleVal + biasVal; - imgOut[index] = vsi_nn_Fp32ToFp16(normVal); - } - } - } - } - return; -} -void myLayerNormFunc_u8 - ( - void* src, - int16_t* scale, - float* bias, - float eps, - void* dst, - uint32_t input_dim, - uint32_t width, - uint32_t height, - uint32_t channel, - uint32_t batch, - int32_t inZp, - int32_t outZp, - float inScale, - float outScale - ) -{ - uint32_t ch = (input_dim <= 2) ? 1 : channel; - uint32_t bn = (input_dim <= 3) ? 
1 : batch; - uint32_t b = 0, c = 0, h = 0, w = 0; - - uint8_t* imgIn, *imgOut; - imgIn = (uint8_t*)src; - imgOut = (uint8_t*)dst; - - VSILOGI("Hello myLayerNormFunc!\n"); - for (b = 0; b < bn; b++) - { - for (c = 0; c < ch; c++) - { - for (h = 0; h < height; h++) - { - uint32_t len = (h + (c + b*ch)*height) * width; - float sum = .0f; - float sumsq = .0f; - float mean = .0f; - float vari = .0f; - - for (w = 0; w < width; w++) - { - uint32_t index = len + w; - //sum += vsi_nn_Fp16toFp32(imgIn[index]); - sum += vsi_nn_AffineToFp32(imgIn[index], inScale, inZp, VSI_NN_TYPE_UINT8); - } - mean = sum / width; - for (w = 0; w < width; w++) - { - uint32_t index = len + w; - //float data = vsi_nn_Fp16toFp32(imgIn[index]) - mean; - float data = vsi_nn_AffineToFp32(imgIn[index], inScale, inZp, VSI_NN_TYPE_UINT8) - mean; - sumsq += data * data; - } - vari = sumsq / width; - vari = (float)(1.0 / sqrtf(vari + eps)); - for (w = 0; w < width; w++) - { - uint32_t index = len + w; - //float data = vsi_nn_Fp16toFp32(imgIn[index]) - mean; - float data = vsi_nn_AffineToFp32(imgIn[index], inScale, inZp, VSI_NN_TYPE_UINT8) - mean; - float scaleVal = vsi_nn_Fp16toFp32(scale[w]); - float biasVal = bias[w]; - float normVal = data * vari * scaleVal + biasVal; - //imgOut[index] = vsi_nn_Fp32ToFp16(normVal); - imgOut[index] = (vx_uint8)vsi_nn_Fp32ToAffine(normVal, outScale, outZp, VSI_NN_TYPE_UINT8); - } - } - } - } - return; -} -vsi_status VX_CALLBACK vxLayerNormKernel - ( - vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ - vsi_status status = VX_ERROR_INVALID_PARAMETERS; - - if(paramNum == 5) - { - vx_context context = NULL; - // tensor - vx_tensor imgObj[4] = { NULL }; - vsi_nn_tensor_attr_t attr[4]; - int16_t *input = NULL, *output = NULL, *scale = NULL; - float *bias = NULL; - uint32_t input_size[4] = {1, 1, 1, 1}, output_size[4] = {1, 1, 1, 1}; - uint32_t scale_size[4] = {1, 1, 1, 1}, bias_size[4] = {1, 1, 1, 1}; - uint32_t input_stride_size[4] = {0}; - uint32_t output_stride_size[4] = {0}; - uint32_t scale_stride_size[4] = {0}; - uint32_t bias_stride_size[4] = {0}; - vx_tensor_addressing input_user_addr = NULL; - vx_tensor_addressing output_user_addr = NULL; - vx_tensor_addressing scale_user_addr = NULL; - vx_tensor_addressing bias_user_addr = NULL; - vsi_nn_type_e inputFormat = VSI_NN_TYPE_FLOAT16, outputFormat = VSI_NN_TYPE_FLOAT16; - vsi_nn_type_e scaleFormat = VSI_NN_TYPE_FLOAT16, biasFormat = VSI_NN_TYPE_FLOAT16; - uint32_t input_dims = 0, output_dims = 0; - uint32_t scale_dims = 0, bias_dims = 0; - uint32_t i; - int32_t in_zp, out_zp; - float in_scale, out_scale; - // scalar - vx_scalar scalar[1] = { NULL }; - float eps = .0f; - - imgObj[0] = (vx_tensor)paramObj[0]; - imgObj[1] = (vx_tensor)paramObj[1]; - imgObj[2] = (vx_tensor)paramObj[2]; - imgObj[3] = (vx_tensor)paramObj[3]; - scalar[0] = (vx_scalar)paramObj[4]; - memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t)); - memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t)); - memset(&attr[2], 0, sizeof(vsi_nn_tensor_attr_t)); - memset(&attr[3], 0, sizeof(vsi_nn_tensor_attr_t)); - context = vxGetContext((vx_reference)node); - if (context == NULL) - { - VSILOGE("vxGetContext failure! at line %d\n", __LINE__); - goto OnError; - } - - status = vsi_nn_vxGetTensorAttr(imgObj[0], &attr[0]); - status |= vsi_nn_vxGetTensorAttr(imgObj[1], &attr[1]); - status |= vsi_nn_vxGetTensorAttr(imgObj[2], &attr[2]); - status |= vsi_nn_vxGetTensorAttr(imgObj[3], &attr[3]); - if (status != VX_SUCCESS) - { - VSILOGE("vsi_nn_vxGetTensorAttr failure! 
at line %d\n", __LINE__); - goto OnError; - } - input_dims = attr[0].dim_num; - inputFormat = attr[0].dtype.vx_type; - for (i = 0; i < input_dims; i++) - { - input_size[i] = attr[0].size[i]; - } - in_zp = attr[0].dtype.zero_point; - in_scale = attr[0].dtype.scale; - - //bias - bias_dims = attr[1].dim_num; - biasFormat = attr[1].dtype.vx_type; - for (i = 0; i < bias_dims; i++) - { - bias_size[i] = attr[1].size[i]; - } - //scale - scale_dims = attr[2].dim_num; - scaleFormat = attr[2].dtype.vx_type; - for (i = 0; i < scale_dims; i++) - { - scale_size[i] = attr[2].size[i]; - } - - //output - output_dims = attr[3].dim_num; - outputFormat = attr[3].dtype.vx_type; - for (i = 0; i < output_dims; i++) - { - output_size[i] = attr[3].size[i]; - } - out_zp = attr[3].dtype.zero_point; - out_scale = attr[3].dtype.scale; - - input_size[2] = (input_dims <= 2)?1:input_size[2]; - input_size[3] = (input_dims <= 3)?1:input_size[3]; - - input_stride_size[0] = vsi_nn_GetTypeBytes(inputFormat); - output_stride_size[0] = vsi_nn_GetTypeBytes(outputFormat); - for (i=1; i< input_dims; i++) - { - input_stride_size[i] = input_stride_size[i-1] * input_size[i-1]; - output_stride_size[i] = output_stride_size[i-1] * output_size[i-1]; - } - input = (int16_t*)malloc(input_size[0]*input_size[1]*input_size[2]*sizeof(int16_t)); - output = (int16_t*)malloc(output_size[0]*output_size[1]*output_size[2]*sizeof(int16_t)); - input_user_addr = vxCreateTensorAddressing(context, input_size, input_stride_size, (vx_uint8)input_dims); - vsi_nn_copy_tensor_patch(imgObj[0], &attr[0], input, VX_READ_ONLY); - //scale and bias - scale_stride_size[0] = vsi_nn_GetTypeBytes(scaleFormat); - bias_stride_size[0] = vsi_nn_GetTypeBytes(biasFormat); - for (i=1; i< scale_dims; i++) - { - scale_stride_size[i] = scale_stride_size[i-1] * scale_size[i-1]; - bias_stride_size[i] = bias_stride_size[i-1] * bias_size[i-1]; - } - scale = (int16_t*)malloc(scale_size[0]*sizeof(int16_t)); - bias = (float*)malloc(bias_size[0]*sizeof(float)); - bias_user_addr = vxCreateTensorAddressing(context, bias_size, bias_stride_size, (vx_uint8)bias_dims); - vsi_nn_copy_tensor_patch(imgObj[1], &attr[1], bias, VX_READ_ONLY); - scale_user_addr = vxCreateTensorAddressing(context, scale_size, scale_stride_size, (vx_uint8)scale_dims); - vsi_nn_copy_tensor_patch(imgObj[2], &attr[2], scale, VX_READ_ONLY); - - // scalar - status = vxCopyScalar(scalar[0], &eps, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - if (status != VX_SUCCESS) - { - VSILOGE("vxCopyScalar failure! 
at line %d\n", __LINE__); - goto OnError; - } - // Call C Prototype - if(inputFormat == VSI_NN_TYPE_FLOAT16) - { - myLayerNormFunc(input, scale, bias, eps, output, input_dims, input_size[0], - input_size[1], input_size[2], input_size[3]); - } - else - { - myLayerNormFunc_u8(input, scale, bias, eps, output, input_dims, input_size[0], - input_size[1], input_size[2], input_size[3], in_zp, out_zp, in_scale, out_scale); - } - - //output tensor - output_user_addr = vxCreateTensorAddressing(context, output_size, - output_stride_size, (vx_uint8)output_dims); - vsi_nn_copy_tensor_patch(imgObj[3], &attr[3], output, VX_WRITE_ONLY); - -OnError: - if(input) free(input); - if(scale) free(scale); - if(bias) free(bias); - if(output) free(output); - if(input_user_addr) vxReleaseTensorAddressing(&input_user_addr); - if(scale_user_addr) vxReleaseTensorAddressing(&scale_user_addr); - if(bias_user_addr) vxReleaseTensorAddressing(&bias_user_addr); - if(output_user_addr) vxReleaseTensorAddressing(&output_user_addr); - } - - return status; -} -vsi_status VX_CALLBACK vxLayerNormInitializer - ( - vx_node nodObj, - const vx_reference *paramObj, - uint32_t paraNum - ) -{ - vsi_status status = VX_SUCCESS; - // Alignment with a power of two value. -#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) - vx_kernel_execution_parameters_t shaderParam = { - 3, // workdim - {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image - {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread - {0, 0, 0}, // localWorkSize: local group size in thread - {0, 0, 0}}; // globalWorkSize: image size in thread - - vx_tensor input = (vx_tensor)paramObj[0]; - vx_tensor scale = (vx_tensor)paramObj[2]; - vx_tensor output = (vx_tensor)paramObj[3]; - uint32_t input_size[4] = {1, 1, 1, 1}; - uint32_t input_dims = 0; - vsi_nn_type_e inputDataFormat = VSI_NN_TYPE_FLOAT16; - vsi_nn_type_e scaleDataFormat = VSI_NN_TYPE_FLOAT16; - vsi_nn_type_e outputDataFormat = VSI_NN_TYPE_FLOAT16; - vx_float32 scaleIn = 0; - vx_float32 scaleOut = 0; - vx_float32 reScaleOut_u8 = 0; - vx_float32 reOutZP = 0.f; - int32_t output_ZP = 0; - int32_t input_ZP = 0; - vx_uint32 iter = 0; - int32_t sumInZp = 0; - int32_t tmpZp1 = 0; - int32_t tmpZp2 = 0; - vx_float32 e2InScale = 0; - vsi_nn_tensor_attr_t attr[3]; - uint32_t i; - - memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t)); - memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t)); - memset(&attr[2], 0, sizeof(vsi_nn_tensor_attr_t)); - status = vsi_nn_vxGetTensorAttr(input, &attr[0]); - status |= vsi_nn_vxGetTensorAttr(output, &attr[1]); - status |= vsi_nn_vxGetTensorAttr(scale, &attr[2]); - if (status != VX_SUCCESS) - { - VSILOGE("vsi_nn_vxGetTensorAttr failure! 
at line %d\n", __LINE__); - return status; - } - - input_dims = attr[0].dim_num; - inputDataFormat = attr[0].dtype.vx_type; - for (i = 0; i < input_dims; i++) - { - input_size[i] = attr[0].size[i]; - } - input_ZP = attr[0].dtype.zero_point; - scaleIn = attr[0].dtype.scale; - outputDataFormat = attr[1].dtype.vx_type; - output_ZP = attr[1].dtype.zero_point; - scaleOut = attr[1].dtype.scale; - scaleDataFormat = attr[2].dtype.vx_type; - - if(outputDataFormat == VSI_NN_TYPE_UINT8) - { - reScaleOut_u8 = 1.0f / scaleOut; - reOutZP = (vx_float32)output_ZP; - } - iter = ((input_size[0] + 15) / 16) * 16; - sumInZp = input_ZP * iter * (-1); - tmpZp1 = (-2) * input_ZP; - tmpZp2 = iter * input_ZP * input_ZP; - e2InScale = scaleIn * scaleIn; - - input_size[2] = (input_dims <= 2)?1:input_size[2]; - - shaderParam.globalWorkOffset[0] = 0; - shaderParam.globalWorkOffset[1] = 0; - shaderParam.globalWorkOffset[2] = 0; - shaderParam.globalWorkScale[0] = input_size[0]; - shaderParam.globalWorkScale[1] = 1; - shaderParam.globalWorkScale[2] = 1; - shaderParam.globalWorkSize[0] = 1; - shaderParam.globalWorkSize[1] = gcmALIGN((input_size[1] + shaderParam.globalWorkScale[1] - 1) - / shaderParam.globalWorkScale[1], 4); - shaderParam.globalWorkSize[2] = input_size[2]; - - status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, - &shaderParam, sizeof(vx_kernel_execution_parameters_t)); - if(status < 0) - { - VSILOGE("[%s : %d]Initializer failure! \n",__FILE__, __LINE__); - } - { - vx_float32 dimRatio = 1.0f / (vx_float32)input_size[0]; - vx_uint32 uniFp16SumSqr_dp8x2[16] = { - 0x55555555, // TCfg - 0x00000000, // ASelt - 0x76543210, 0x76543210, // ABin - 0x5555aaaa, // BSelt - 0x00000000, 0x76543210, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }; - vx_uint32 UniFP16toFP32Lo4_dp4x4[16] = { - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00010000, 0x00030002, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant - }; - vx_uint32 uniExtractHalf4_dp4x4[16] = { - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00020000, 0x00060004, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant - }; - vx_uint32 uniConvertSecFp16Fp32_4x4[16] = { - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00050004, 0x00070006, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant - }; - vx_uint32 uniSumU8_16x1[16] = { - 0x55555555, // TCfg - 0x00000000, // ASelt - 0x76543210, 0xfedcba98, // ABin - 0xaaaaaaaa, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant - }; - vx_uint32 uniSqrSum_16x1[16] = { - 0x55555555, // TCfg - 0x00000000, // ASelt - 0x76543210, 0xfedcba98, // ABin - 0x55555555, // BSelt - 0x76543210, 0xfedcba98, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }; - vx_uint32 uniConvert1stUint8SubZpToFp32_4x4[16] = { - 0x05050505, // TCfg - 0x04040404, // ASelt - 0x00010000, 0x00030002, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }; - vx_uint32 uniConvert2ndUint8SubZpToFp32_4x4[16] = { - 0x05050505, // TCfg - 0x04040404, // ASelt - 0x00050004, 0x00070006, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }; - vx_uint32 uniConvert3rdUint8SubZpToFp32_4x4[16] = { - 0x05050505, // TCfg - 0x04040404, // ASelt - 0x00090008, 0x000b000a, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }; - vx_uint32 uniConvert4thUint8SubZpToFp32_4x4[16] = { - 0x05050505, // TCfg - 0x04040404, // ASelt - 0x000d000c, 0x000f000e, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }; - vx_uint32 uniConvertInt32toUint8_2x8[16] = { - 0x33333333, // TCfg - 0x11110000, // ASelt - 0x03020100, 0x03020100, // ABin - 0x00000000, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }; - vx_uint32 UniPackFP16even_2x8[16] = { - 0x11111111, // TCfg - 0x11110000, // ASelt - 0x06040200, 0x06040200, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant - }; - if (inputDataFormat == VSI_NN_TYPE_UINT8 && outputDataFormat == VSI_NN_TYPE_FLOAT16 - && scaleDataFormat == VSI_NN_TYPE_FLOAT16) - { - status = vxSetNodeUniform(nodObj, "width", 1, &input_size[0]); - status |= vxSetNodeUniform(nodObj, "dimRatio", 1, &dimRatio); - status |= vxSetNodeUniform(nodObj, "UniFP16toFP32Lo4_dp4x4", 1, UniFP16toFP32Lo4_dp4x4); - status |= vxSetNodeUniform(nodObj, "uniConvertSecFp16Fp32_4x4", 1, uniConvertSecFp16Fp32_4x4); - status |= vxSetNodeUniform(nodObj, "uniSumU8_16x1", 1, uniSumU8_16x1); - status |= vxSetNodeUniform(nodObj, "uniSqrSum_16x1", 1, uniSqrSum_16x1); - status |= vxSetNodeUniform(nodObj, "uniConvert1stUint8SubZpToFp32_4x4", 1, uniConvert1stUint8SubZpToFp32_4x4); - status |= vxSetNodeUniform(nodObj, "uniConvert2ndUint8SubZpToFp32_4x4", 1, uniConvert2ndUint8SubZpToFp32_4x4); - status |= vxSetNodeUniform(nodObj, "uniConvert3rdUint8SubZpToFp32_4x4", 1, uniConvert3rdUint8SubZpToFp32_4x4); - status |= vxSetNodeUniform(nodObj, "uniConvert4thUint8SubZpToFp32_4x4", 1, uniConvert4thUint8SubZpToFp32_4x4); - status |= vxSetNodeUniform(nodObj, "inputZP", 1, &input_ZP); - status |= vxSetNodeUniform(nodObj, "input_scale", 1, &scaleIn); - status |= vxSetNodeUniform(nodObj, "sumInZp", 1, &sumInZp); - status |= vxSetNodeUniform(nodObj, "tmpZp1", 1, &tmpZp1); - status |= 
vxSetNodeUniform(nodObj, "tmpZp2", 1, &tmpZp2); - status |= vxSetNodeUniform(nodObj, "e2InScale", 1, &e2InScale); - status |= vxSetNodeUniform(nodObj, "UniPackFP16even_2x8", 1, UniPackFP16even_2x8); - } - else - { - status = vxSetNodeUniform(nodObj, "uniFp16SumSqr_dp8x2", 1, uniFp16SumSqr_dp8x2); - status |= vxSetNodeUniform(nodObj, "width", 1, &input_size[0]); - status |= vxSetNodeUniform(nodObj, "dimRatio", 1, &dimRatio); - status |= vxSetNodeUniform(nodObj, "UniFP16toFP32Lo4_dp4x4", 1, UniFP16toFP32Lo4_dp4x4); - status |= vxSetNodeUniform(nodObj, "uniExtractHalf4_dp4x4", 1, uniExtractHalf4_dp4x4); - status |= vxSetNodeUniform(nodObj, "uniConvertInt32toUint8_2x8", 1, uniConvertInt32toUint8_2x8); - status |= vxSetNodeUniform(nodObj, "uniConvertSecFp16Fp32_4x4", 1, uniConvertSecFp16Fp32_4x4); - status |= vxSetNodeUniform(nodObj, "uniSumU8_16x1", 1, uniSumU8_16x1); - status |= vxSetNodeUniform(nodObj, "uniSqrSum_16x1", 1, uniSqrSum_16x1); - status |= vxSetNodeUniform(nodObj, "uniConvert1stUint8SubZpToFp32_4x4", 1, uniConvert1stUint8SubZpToFp32_4x4); - status |= vxSetNodeUniform(nodObj, "uniConvert2ndUint8SubZpToFp32_4x4", 1, uniConvert2ndUint8SubZpToFp32_4x4); - status |= vxSetNodeUniform(nodObj, "uniConvert3rdUint8SubZpToFp32_4x4", 1, uniConvert3rdUint8SubZpToFp32_4x4); - status |= vxSetNodeUniform(nodObj, "uniConvert4thUint8SubZpToFp32_4x4", 1, uniConvert4thUint8SubZpToFp32_4x4); - status |= vxSetNodeUniform(nodObj, "inputZP", 1, &input_ZP); - status |= vxSetNodeUniform(nodObj, "output_ZP", 1, &output_ZP); - status |= vxSetNodeUniform(nodObj, "input_scale", 1, &scaleIn); - status |= vxSetNodeUniform(nodObj, "outputScale", 1, &reScaleOut_u8); - status |= vxSetNodeUniform(nodObj, "outputZP", 1, &reOutZP); - status |= vxSetNodeUniform(nodObj, "sumInZp", 1, &sumInZp); - status |= vxSetNodeUniform(nodObj, "tmpZp1", 1, &tmpZp1); - status |= vxSetNodeUniform(nodObj, "tmpZp2", 1, &tmpZp2); - status |= vxSetNodeUniform(nodObj, "e2InScale", 1, &e2InScale); - } - if(status < 0) - { - VSILOGE("[%s : %d]Initializer failure! 
\n",__FILE__, __LINE__); - } - } - return status; -} -static vx_param_description_t vxLayerNormKernelParam[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED} -}; -#ifdef __cplusplus -extern "C" { -#endif -vx_kernel_description_t vxLayerNormKernelInfo = -{ - VX_KERNEL_ENUM_LAYERNORM, - VX_KERNEL_NAME_LAYERNORM, - NULL, - vxLayerNormKernelParam, - (sizeof(vxLayerNormKernelParam) / sizeof(vxLayerNormKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxLayerNormInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxLayerNormKernelInfo_u8 = -{ - VX_KERNEL_ENUM_LAYERNORM, - VX_KERNEL_NAME_LAYERNORM_UINT8, - NULL, - vxLayerNormKernelParam, - (sizeof(vxLayerNormKernelParam) / sizeof(vxLayerNormKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxLayerNormInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxLayerNormKernelInfo_FP16toU8 = -{ - VX_KERNEL_ENUM_LAYERNORM_FP16TOU8, - VX_KERNEL_NAME_LAYERNORM_FP16TOU8, - NULL, - vxLayerNormKernelParam, - (sizeof(vxLayerNormKernelParam) / sizeof(vxLayerNormKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxLayerNormInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxLayerNormKernelInfo_U8toFP16 = -{ - VX_KERNEL_ENUM_LAYERNORM, - VX_KERNEL_NAME_LAYERNORM_U8TOFP16, - NULL, - vxLayerNormKernelParam, - (sizeof(vxLayerNormKernelParam) / sizeof(vxLayerNormKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxLayerNormInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxLayerNormKernelInfo_CPU = -{ - VX_KERNEL_ENUM_LAYERNORM, - VX_KERNEL_NAME_LAYERNORM, - vxLayerNormKernel, - vxLayerNormKernelParam, - (sizeof(vxLayerNormKernelParam) / sizeof(vxLayerNormKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vsi_nn_KernelInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t * vx_kernel_LAYERNORM_list[] = -{ - &vxLayerNormKernelInfo_CPU, - &vxLayerNormKernelInfo, - &vxLayerNormKernelInfo_u8, - &vxLayerNormKernelInfo_FP16toU8, - &vxLayerNormKernelInfo_U8toFP16, - NULL -}; -#ifdef __cplusplus -} -#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_reduce.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_reduce.c deleted file mode 100644 index fa478d0..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_reduce.c +++ /dev/null @@ -1,190 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - -#include -#include -#include - -#include "vsi_nn_platform.h" - -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -#define _VX_KERNEL_VAR (vx_kernel_REDUCE) -#define _VX_KERNEL_ID (VX_KERNEL_ENUM_REDUCE) -#define _VX_KERNEL_NAME ("vsi_nn_kernel_reduce") -#define _VX_KERNEL_FUNC_KERNEL (vxReduceKernel) - -static vx_status VX_CALLBACK vxReduceKernel - ( - vx_node node, - const vx_reference* paramObj, - vx_uint32 paramNum - ) -{ - /* TODO: */ -#define ARG_NUM (6) -#define TENSOR_NUM_INPUT (1) -#define TENSOR_NUM_OUTPUT (1) -#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT) - - vx_status status = VX_SUCCESS; - vx_context context = NULL; - vsi_nn_tensor_attr_t attr[TENSOR_NUM]; - vx_uint32 stride_size[TENSOR_NUM][VSI_NN_MAX_DIM_NUM]; - vx_tensor_addressing user_addr[TENSOR_NUM] = {NULL}; - vx_uint8 *buffer_ptr[TENSOR_NUM] = {NULL}; - vx_tensor tensor[TENSOR_NUM]; - - vx_float32 factor0; - vx_int32 factor; - vx_uint32 batch, c, h, w; - vx_uint32 i, j, k, b; - - //prepare data - context = vxGetContext((vx_reference)node); - - for( i = 0; i < TENSOR_NUM_INPUT; i ++ ) - { - tensor[i] = (vx_tensor)paramObj[i]; - buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i], - &(attr[i]), stride_size[i], &(user_addr[i]), VX_READ_ONLY); - } - for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ ) - { - tensor[i] = (vx_tensor)paramObj[i]; - buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i], - &(attr[i]), stride_size[i], &(user_addr[i]), VX_WRITE_ONLY); - } - - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(factor0), VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - - //op calc - if (factor0 > 1) - { - factor = (vx_int32)(factor0 + 0.5); - w = attr[0].size[0]; - h = attr[0].size[1]; - c = attr[0].size[2]; - batch = 1; - for(b = 0; b < batch; ++b){ - for(k = 0; k < c; ++k){ - for(j = 0; j < h*factor; ++j){ - for(i = 0; i < w*factor; ++i){ - vx_int32 in_index = b*w*h*c + k*w*h + (j/factor)*w + i/factor; - vx_int32 out_index = b*w*h*c*factor*factor + k*w*h*factor*factor + - j*w*factor + i; - vx_float32 fval; - //out[out_index] = in[in_index]; - vsi_nn_DtypeToFloat32(&buffer_ptr[0][stride_size[0][0] * in_index], - &fval, &attr[0].dtype); - vsi_nn_Float32ToDtype(fval, &buffer_ptr[1][stride_size[1][0] * out_index], - &attr[1].dtype); - } - } - } - } - } - else - { - factor = (vx_int32)(1 / factor0 + 0.5); - w = attr[1].size[0]; - h = attr[1].size[1]; - c = attr[1].size[2]; - batch = 1; - for(b = 0; b < batch; ++b){ - for(k = 0; k < c; ++k){ - for(j = 0; j < h; ++j){ - for(i = 0; i < w; ++i){ - vx_int32 in_index = b*w*h*c*factor*factor + - k*w*h*factor*factor + j*w*factor*factor + i*factor; - vx_int32 out_index = b*w*h*c + k*w*h + j * w + i; - vx_float32 fval; - //out[out_index] = in[in_index]; - 
vsi_nn_DtypeToFloat32(&buffer_ptr[0][stride_size[0][0] * in_index], &fval, - &attr[0].dtype); - vsi_nn_Float32ToDtype(fval, &buffer_ptr[1][stride_size[1][0] * out_index], - &attr[1].dtype); - } - } - } - } - } - - //save data - for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ ) - { - status = vsi_nn_copy_tensor_patch(tensor[i], &attr[i], buffer_ptr[i], VX_WRITE_ONLY); - if (user_addr[i]) vxReleaseTensorAddressing(&(user_addr[i])); - } - for( i = 0; i < TENSOR_NUM; i ++ ) - { - if (buffer_ptr[i]) free(buffer_ptr[i]); - } - return status; -} /* _VX_KERNEL_FUNC_KERNEL() */ - -static vx_param_description_t s_params[] = -{ - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, -}; - -#ifdef __cplusplus -extern "C" { -#endif -vx_kernel_description_t _VX_KERNEL_VAR = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - _VX_KERNEL_FUNC_KERNEL, - s_params, - _cnt_of_array( s_params ), - vsi_nn_KernelValidator, - NULL, - NULL, - vsi_nn_KernelInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t * vx_kernel_REDUCE_list[] = -{ - &_VX_KERNEL_VAR, - NULL -}; -#ifdef __cplusplus -} -#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_resize.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_resize.c deleted file mode 100644 index ef9a073..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_resize.c +++ /dev/null @@ -1,283 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ - -#include -#include -#include -#include - -#include "vsi_nn_platform.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -#define _VX_KERNEL_VAR (vx_kernel_RESIZE) -#define _VX_KERNEL_ID (VX_KERNEL_ENUM_RESIZE) -#define _VX_KERNEL_NAME ("vsi_nn_kernel_resize") -#define _VX_KERNEL_FUNC_KERNEL (vxResizeKernel) - -static vsi_status VX_CALLBACK vxResizeKernel - ( - vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ - /* TODO: */ -#define ARG_NUM (1) -#define TENSOR_NUM_INPUT (1) -#define TENSOR_NUM_OUTPUT (1) -#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT) - - vsi_status status = VX_SUCCESS; - vx_context context = NULL; - vsi_nn_tensor_attr_t attr[TENSOR_NUM]; - uint32_t stride_size[TENSOR_NUM][VSI_NN_MAX_DIM_NUM]; - vx_tensor_addressing user_addr[TENSOR_NUM] = {NULL}; - uint8_t *buffer_ptr[TENSOR_NUM] = {NULL}; - vx_tensor tensor[TENSOR_NUM]; - - float factor0; - int32_t factor; - uint32_t batch, c, h, w; - uint32_t i, j, k, b; - - //prepare data - context = vxGetContext((vx_reference)node); - - for( i = 0; i < TENSOR_NUM_INPUT; i ++ ) - { - tensor[i] = (vx_tensor)paramObj[i]; - buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i], - &(attr[i]), stride_size[i], &(user_addr[i]), VX_READ_ONLY); - } - for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ ) - { - tensor[i] = (vx_tensor)paramObj[i]; - buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i], - &(attr[i]), stride_size[i], &(user_addr[i]), VX_WRITE_ONLY); - } - - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(factor0), VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - - //op calc - if (factor0 > 1) - { - factor = (int32_t)(factor0 + 0.5); - w = attr[0].size[0]; - h = attr[0].size[1]; - c = attr[0].size[2]; - batch = 1; - for(b = 0; b < batch; ++b){ - for(k = 0; k < c; ++k){ - for(j = 0; j < h*factor; ++j){ - for(i = 0; i < w*factor; ++i){ - int32_t in_index = b*w*h*c + k*w*h + (j/factor)*w + i/factor; - int32_t out_index = b*w*h*c*factor*factor + k*w*h*factor*factor + - j*w*factor + i; - float fval; - //out[out_index] = in[in_index]; - vsi_nn_DtypeToFloat32(&buffer_ptr[0][stride_size[0][0] * in_index], - &fval, &attr[0].dtype); - vsi_nn_Float32ToDtype(fval, &buffer_ptr[1][stride_size[1][0] * out_index], - &attr[1].dtype); - } - } - } - } - } - else - { - factor = (int32_t)(1 / factor0 + 0.5); - w = attr[1].size[0]; - h = attr[1].size[1]; - c = attr[1].size[2]; - batch = 1; - for(b = 0; b < batch; ++b){ - for(k = 0; k < c; ++k){ - for(j = 0; j < h; ++j){ - for(i = 0; i < w; ++i){ - int32_t in_index = b*w*h*c*factor*factor + - k*w*h*factor*factor + j*w*factor*factor + i*factor; - int32_t out_index = b*w*h*c + k*w*h + j * w + i; - float fval; - //out[out_index] = in[in_index]; - vsi_nn_DtypeToFloat32(&buffer_ptr[0][stride_size[0][0] * in_index], &fval, - &attr[0].dtype); - vsi_nn_Float32ToDtype(fval, &buffer_ptr[1][stride_size[1][0] * out_index], - &attr[1].dtype); - } - } - } - } - } - - //save data - for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ ) - { - status = vsi_nn_copy_tensor_patch(tensor[i], &attr[i], buffer_ptr[i], VX_WRITE_ONLY); - if (user_addr[i]) vxReleaseTensorAddressing(&(user_addr[i])); - } - for( i = 0; i < TENSOR_NUM; i ++ ) - { - if (buffer_ptr[i]) free(buffer_ptr[i]); - } - return status; 
-} /* _VX_KERNEL_FUNC_KERNEL() */ - -static vx_param_description_t s_params[] = -{ - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, -}; - -vsi_status VX_CALLBACK vxTensorResizeInitializer - ( - vx_node nodObj, - const vx_reference *paramObj, - uint32_t paraNum - ) -{ - // Alignment with a power of two value. -#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) - vx_kernel_execution_parameters_t shaderParam = { - 2, // workdim - {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image - {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread - {0, 0, 0}, // localWorkSize: local group size in thread - {0, 0, 0}}; // globalWorkSize: image size in thread - uint32_t uniPackEvenData_2x8[16] = { - 0x33333333, // TCfg - 0x11110000, // ASelt - 0x06040200, 0x06040200, // ABin - 0x00000000, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00003400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }; - vsi_status status = VX_SUCCESS; - - vx_tensor input = (vx_tensor)paramObj[0]; - uint32_t input_size[DIM_SIZE] = {1, 1, 1, 1}; - vsi_nn_tensor_attr_t attr; - uint32_t i, input_dim; - - memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); - status = vsi_nn_vxGetTensorAttr(input, &attr); - if (status != VX_SUCCESS) - { - VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__); - return status; - } - input_dim = attr.dim_num; - for (i = 0; i < input_dim; i++) - { - input_size[i] = attr.size[i]; - } - - shaderParam.globalWorkOffset[0] = 0; - shaderParam.globalWorkOffset[1] = 0; - shaderParam.globalWorkScale[0] = 16; - shaderParam.globalWorkScale[1] = 2; - shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1) - / shaderParam.globalWorkScale[0], 4); - shaderParam.globalWorkSize[1] = (input_size[1] + shaderParam.globalWorkScale[1] - 1) - / shaderParam.globalWorkScale[1]; - - vxSetNodeUniform(nodObj, "uniPackEvenData_2x8", 1, uniPackEvenData_2x8); - - status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, - &shaderParam, sizeof(vx_kernel_execution_parameters_t)); - - return VX_SUCCESS; -} - -static vx_param_description_t vxTensorResizeKernelParam[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED} -}; - -#ifdef __cplusplus -extern "C" { -#endif -vx_kernel_description_t _VX_KERNEL_VAR = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - _VX_KERNEL_FUNC_KERNEL, - s_params, - _cnt_of_array( s_params ), - vsi_nn_KernelValidator, - NULL, - NULL, - vsi_nn_KernelInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxTensorResize16BitsDownSampleQuarterKernelInfo = -{ - VX_KERNEL_ENUM_RESIZE_16BITS_DOWNSAMPLE_QUARTER, - VX_KERNEL_NAME_RESIZE_16BITS_DOWNSAMPLE_QUARTER, - NULL, - vxTensorResizeKernelParam, - (sizeof(vxTensorResizeKernelParam) / sizeof(vxTensorResizeKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxTensorResizeInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxTensorResize8BitsDownSampleQuarterKernelInfo = -{ - VX_KERNEL_ENUM_RESIZE_8BITS_DOWNSAMPLE_QUARTER, - VX_KERNEL_NAME_RESIZE_8BITS_DOWNSAMPLE_QUARTER, - NULL, - vxTensorResizeKernelParam, - (sizeof(vxTensorResizeKernelParam) / 
sizeof(vxTensorResizeKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxTensorResizeInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t * vx_kernel_RESIZE_list[] = -{ - &_VX_KERNEL_VAR, - &vxTensorResize16BitsDownSampleQuarterKernelInfo, - &vxTensorResize8BitsDownSampleQuarterKernelInfo, - NULL -}; -#ifdef __cplusplus -} -#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_roi_align.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_roi_align.c deleted file mode 100644 index 0287f19..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_roi_align.c +++ /dev/null @@ -1,317 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ -#include -#include -#include -#include -#include - -#include "vsi_nn_platform.h" - -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" -#include "vsi_nn_test.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -#define _VX_KERNEL_VAR (vx_kernel_ROI_ALIGN) -#define _VX_KERNEL_ID (VX_KERNEL_ENUM_ROI_ALIGN) -#define _VX_KERNEL_NAME (VX_KERNEL_NAME_ROI_ALIGN) -#define _VX_KERNEL_FUNC_KERNEL (vxRoi_alignKernel) - -static vsi_status VX_CALLBACK vxRoi_alignKernel - ( - vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ -#define ARG_NUM (6) -#define TENSOR_NUM_INPUT (3) -#define TENSOR_NUM_OUTPUT (1) -#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT) - - vsi_status status = VSI_FAILURE; - vx_context context = NULL; - vx_tensor input[TENSOR_NUM_INPUT] = {0}; - vx_tensor output[TENSOR_NUM_OUTPUT] = {0}; - float *f32_in_buffer[TENSOR_NUM_INPUT] = {0}; - int32_t* int32_in_buffer[TENSOR_NUM_INPUT] = {0}; - float *f32_out_buffer[TENSOR_NUM_OUTPUT] = {0}; - vsi_nn_tensor_attr_t in_attr[TENSOR_NUM_INPUT]; - vsi_nn_tensor_attr_t out_attr[TENSOR_NUM_OUTPUT]; - uint32_t in_elements[TENSOR_NUM_INPUT] = {0}; - uint32_t out_elements[TENSOR_NUM_OUTPUT]= {0}; - - int32_t output_height; - int32_t output_width; - float height_ratio; - float width_ratio; - int32_t height_sample_num; - int32_t width_sample_num; - - uint32_t i = 0; - for(i = 0; i < TENSOR_NUM_INPUT; i++) - { - memset(&in_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); - } - for(i = 0; i < TENSOR_NUM_OUTPUT; i++) - { - memset(&out_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); - } - /* prepare data */ - context = vxGetContext((vx_reference)node); - - for(i = 0; i < TENSOR_NUM_INPUT; i ++) - { - input[i] = (vx_tensor)paramObj[i]; - status = vsi_nn_vxGetTensorAttr(input[i], &in_attr[i]); - TEST_CHECK_STATUS(status, final); - in_elements[i] = vsi_nn_vxGetTensorElementNum(&in_attr[i]); - if (i == 2) - { - int32_in_buffer[i] = (int32_t *)vsi_nn_vxCopyTensorToData(context, - input[i], &in_attr[i]); - } - else - { - f32_in_buffer[i] = (float *)malloc(in_elements[i] * sizeof(float)); - status = vsi_nn_vxConvertTensorToFloat32Data( - context, input[i], &in_attr[i], f32_in_buffer[i], - in_elements[i] * sizeof(float)); - TEST_CHECK_STATUS(status, final); - } - } - for(i = 0; i < TENSOR_NUM_OUTPUT; i ++) - { - output[i] = (vx_tensor)paramObj[i + TENSOR_NUM_INPUT]; - status = vsi_nn_vxGetTensorAttr(output[i], &out_attr[i]); - TEST_CHECK_STATUS(status, final); - out_elements[i] = vsi_nn_vxGetTensorElementNum(&out_attr[i]); - f32_out_buffer[i]= (float *)malloc(out_elements[i] * sizeof(float)); - memset(f32_out_buffer[i], 0, out_elements[i] * sizeof(float)); - } - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(output_height), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 1], &(output_width), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 2], &(height_ratio), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 3], &(width_ratio), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 4], &(height_sample_num), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 5], &(width_sample_num), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - - /* 
TODO: Add CPU kernel implement */ - { - uint32_t n, j, k; - uint32_t kRoiDim = 4; - float heightScale = 1.0f / height_ratio; - float widthScale = 1.0f / width_ratio; - uint32_t inHeight = in_attr[0].size[2]; - uint32_t inWidth = in_attr[0].size[1]; - uint32_t inDepth = in_attr[0].size[0]; - uint32_t numRois = in_attr[1].size[1]; - uint32_t outHeight = out_attr[0].size[2]; - uint32_t outWidth = out_attr[0].size[1]; - uint32_t out_index = 0; - - for(n = 0; n < numRois; n++) - { - uint32_t batchId = int32_in_buffer[2][n]; - float scale = (in_attr[1].dtype.vx_type == VSI_NN_TYPE_UINT16) ? 0.125f : 1.0f; - float wRoiStart = f32_in_buffer[1][n * kRoiDim] * widthScale * scale; - float hRoiStart = f32_in_buffer[1][n * kRoiDim + 1] * heightScale * scale; - float wRoiEnd = f32_in_buffer[1][n * kRoiDim + 2] * widthScale * scale; - float hRoiEnd = f32_in_buffer[1][n * kRoiDim + 3] * heightScale * scale; - - float roiWidth = vsi_nn_max((wRoiEnd - wRoiStart), 1.0f); - float roiHeight = vsi_nn_max((hRoiEnd - hRoiStart), 1.0f); - float wStepSize = roiWidth / outWidth; - float hStepSize = roiHeight / outHeight; - - uint32_t wSamplingRatio = width_sample_num > 0 - ? width_sample_num : (uint32_t)ceil(wStepSize); - uint32_t hSamplingRatio = height_sample_num > 0 - ? height_sample_num : (uint32_t)ceil(hStepSize); - int32_t numSamplingPoints = wSamplingRatio * hSamplingRatio; - float wBinSize = wStepSize / (float)(wSamplingRatio); - float hBinSize = hStepSize / (float)(hSamplingRatio); - - int32_t batch_base_index = batchId * inHeight * inWidth * inDepth; - - for (i = 0; i < outHeight; i++) - { - for (j = 0; j < outWidth; j++) - { - float wStart = wStepSize * j + wRoiStart; - float wEnd = wStepSize * (j + 1) + wRoiStart; - float hStart = hStepSize * i + hRoiStart; - float hEnd = hStepSize * (i + 1) + hRoiStart; - - float x,y; - for (y = hStart + hBinSize / 2; y < hEnd; y += hBinSize) - { - for (x = wStart + wBinSize / 2; x < wEnd; x += wBinSize) - { - uint32_t x1 = (uint32_t)floor(x); - uint32_t y1 = (uint32_t)floor(y); - uint32_t x2 = x1 + 1, y2 = y1 + 1; - float dx1 = x - (float)(x1); - float dy1 = y - (float)(y1); - if (x1 >= inWidth - 1) { - x1 = x2 = inWidth - 1; - dx1 = 0; - } - if (y1 >= inHeight - 1) { - y1 = y2 = inHeight - 1; - dy1 = 0; - } - { - float dx2 = 1.0f - dx1, dy2 = 1.0f - dy1; - float ws[] = {dx2 * dy2, dx1 * dy2, - dx2 * dy1, dx1 * dy1}; - uint32_t offsets[] = {y1 * inWidth * inDepth + x1 * inDepth, - y1 * inWidth * inDepth + x2 * inDepth, - y2 * inWidth * inDepth + x1 * inDepth, - y2 * inWidth * inDepth + x2 * inDepth}; - for (k = 0; k < inDepth; k++) { - float interpolation = 0; - uint32_t c; - for (c = 0; c < 4; c++) - { - interpolation += ws[c] - * f32_in_buffer[0][batch_base_index + offsets[c] + k]; - } - f32_out_buffer[0][out_index + k] += interpolation; - } - } - } - } - for (k = 0; k < inDepth; k++) - { - f32_out_buffer[0][out_index + k] /= (float)(numSamplingPoints); - } - out_index += inDepth; - } - } - } - } - - /* save data */ - for(i = 0; i < TENSOR_NUM_OUTPUT; i++) - { - status = vsi_nn_vxConvertFloat32DataToTensor( - context, output[i], &out_attr[i], f32_out_buffer[i], - out_elements[i] * sizeof(float)); - TEST_CHECK_STATUS(status, final); - } - -final: - for (i = 0; i < TENSOR_NUM_INPUT; i++) - { - if (f32_in_buffer[i]) free(f32_in_buffer[i]); - if (int32_in_buffer[i]) free(int32_in_buffer[i]); - } - for(i = 0; i < TENSOR_NUM_OUTPUT; i++) - { - if (f32_out_buffer[i]) free(f32_out_buffer[i]); - } - return status; -} /* _VX_KERNEL_FUNC_KERNEL() */ - -static 
vx_param_description_t vxRoi_alignKernelParam[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; - -vx_status VX_CALLBACK vxRoi_alignInitializer - ( - vx_node nodObj, - const vx_reference *paramObj, - vx_uint32 paraNum - ) -{ - vx_status status = VX_SUCCESS; - /*TODO: Add initial code for VX program*/ - - return status; -} - - -#ifdef __cplusplus -extern "C" { -#endif -vx_kernel_description_t vxRoi_align_CPU = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - _VX_KERNEL_FUNC_KERNEL, - vxRoi_alignKernelParam, - _cnt_of_array( vxRoi_alignKernelParam ), - vsi_nn_KernelValidator, - NULL, - NULL, - vsi_nn_KernelInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxRoi_align_VX = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - NULL, - vxRoi_alignKernelParam, - _cnt_of_array( vxRoi_alignKernelParam ), - vsi_nn_KernelValidator, - NULL, - NULL, - vxRoi_alignInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t * vx_kernel_ROI_ALIGN_list[] = -{ - &vxRoi_align_CPU, - &vxRoi_align_VX, - NULL -}; -#ifdef __cplusplus -} -#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_scale.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_scale.c deleted file mode 100644 index d97517e..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_scale.c +++ /dev/null @@ -1,410 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ - -#include -#include -#include -#include - -#include "vsi_nn_platform.h" - -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" -#include "vsi_nn_test.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -#define _VX_KERNEL_VAR (vx_kernel_SCALE) -#define _VX_KERNEL_ID (VX_KERNEL_ENUM_SCALE) -#define _VX_KERNEL_NAME ("vsi_nn_kernel_scale") -#define _VX_KERNEL_FUNC_KERNEL (vxScaleKernel) - -static vsi_status VX_CALLBACK vxScaleKernel - ( - vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ - vsi_status status = VX_ERROR_INVALID_PARAMETERS; - - if( 6 == paramNum ) - { - vx_context context = NULL; - vx_tensor input_tensor = NULL; - vx_tensor scale_tensor = NULL; - vx_tensor bias_tensor = NULL; - vx_tensor output_tensor = NULL; - uint8_t * input_buffer = NULL; - uint8_t * scale_buffer = NULL; - uint8_t * bias_buffer = NULL; - uint8_t * output_buffer = NULL; - vx_scalar axis_scalar = NULL; - vx_scalar has_bias_scalar = NULL; - int axis = 1; - float has_bias = 0; - uint32_t input_dims = 0; - uint32_t scale_dims = 0; - uint32_t bias_dims = 0; - uint32_t output_dims = 0; - vsi_enum inputFormat = VSI_NN_TYPE_FLOAT16; - vsi_enum scaleFormat = VSI_NN_TYPE_FLOAT16; - vsi_enum biasFormat = VSI_NN_TYPE_FLOAT32; - vsi_enum outputFormat = VSI_NN_TYPE_FLOAT16; - uint32_t input_size[4] = {1, 1, 1, 1}; - uint32_t scale_size[4] = {1, 1, 1, 1}; - uint32_t bias_size[4] = {1, 1, 1, 1}; - uint32_t output_size[4] = {1, 1, 1, 1}; - uint32_t input_stride_size[VSI_NN_MAX_DIM_NUM] = { 0 }; - uint32_t output_stride_size[VSI_NN_MAX_DIM_NUM] = { 0 }; - vx_tensor_addressing input_user_addr = NULL; - vx_tensor_addressing scale_user_addr = NULL; - vx_tensor_addressing bias_user_addr = NULL; - vx_tensor_addressing output_user_addr = NULL; - vsi_nn_tensor_attr_t out_attr; - - status = VX_SUCCESS; - - memset(&out_attr, 0x0, sizeof(vsi_nn_tensor_attr_t)); - - input_tensor = (vx_tensor)paramObj[0]; - scale_tensor = (vx_tensor)paramObj[1]; - bias_tensor = (vx_tensor)paramObj[2]; - output_tensor = (vx_tensor)paramObj[3]; - axis_scalar = (vx_scalar)paramObj[4]; - has_bias_scalar = (vx_scalar)paramObj[5]; - - context = vxGetContext((vx_reference)node); - if( NULL == context) - { - VSILOGE("vxGetContext failure!\n"); - status = VX_FAILURE; - goto OnError; - } - - input_buffer = vsi_nn_ConvertRawTensorToData(context, input_tensor, - &input_dims, &inputFormat, input_size, input_stride_size, - &input_user_addr, VX_READ_ONLY); - if( NULL == input_buffer ) - { - VSILOGE("vsi_nn_ConvertRawTensorToData failure!\n"); - status = VX_ERROR_NO_MEMORY; - goto OnError; - } - - scale_buffer = vsi_nn_ConvertRawTensorToData(context, scale_tensor, - &scale_dims, &scaleFormat, scale_size, input_stride_size, - &scale_user_addr, VX_READ_ONLY); - if( NULL == scale_buffer ) - { - VSILOGE("vsi_nn_ConvertRawTensorToData failure!\n"); - status = VX_ERROR_NO_MEMORY; - goto OnError; - } - - bias_buffer = vsi_nn_ConvertRawTensorToData(context, bias_tensor, - &bias_dims, &biasFormat, bias_size, input_stride_size, - &bias_user_addr, VX_READ_ONLY); - if( NULL == bias_buffer ) - { - VSILOGE("vsi_nn_ConvertRawTensorToData failure!\n"); - status = VX_ERROR_NO_MEMORY; - goto OnError; - } - - output_buffer = vsi_nn_ConvertRawTensorToData(context, output_tensor, - &output_dims, &outputFormat, output_size, 
output_stride_size, - &output_user_addr, VX_WRITE_ONLY); - if( NULL == output_buffer ) - { - VSILOGE("vsi_nn_ConvertRawTensorToData failure!\n"); - status = VX_ERROR_NO_MEMORY; - goto OnError; - } - - status = vsi_nn_vxGetTensorAttr(output_tensor, &out_attr); - if (status != VX_SUCCESS) - { - VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__); - goto OnError; - } - - status = vxCopyScalar(axis_scalar, &axis, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - if( VX_SUCCESS != status) - { - VSILOGE("vxCopyScalar axis failure! status:%d\n", status); - goto OnError; - } - status = vxCopyScalar(has_bias_scalar, &has_bias, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - if( VX_SUCCESS != status ) - { - VSILOGE("vxCopyScalar axis failure! has_bias:%f\n", has_bias); - goto OnError; - } - - if( input_dims != output_dims ) - { - VSILOGE("Invalid parameters, input_dims output_dims mismatch %d:%d\n", - input_dims, output_dims); - status = VX_ERROR_INVALID_PARAMETERS; - goto OnError; - } - if( input_size[0] != scale_size[0] || input_size[0] != bias_size[0] ) - { - VSILOGE("Invalid parameters, input size mismatch %d:%d:%d\n", - input_size[0], scale_size[0], bias_size[0]); - status = VX_ERROR_INVALID_PARAMETERS; - goto OnError; - } - { - uint32_t i = 0; - uint32_t j = 0; - uint32_t fixed_num = 1; - uint32_t changed_num = 1; - - fixed_num = input_size[1] * input_size[2] * input_size[3]; - changed_num = input_size[0]; - - for( i = 0; i < fixed_num; i++ ) - { - int16_t* cur_input_row_ofst = ((int16_t *)input_buffer) + i * changed_num; - int16_t* cur_scale_row_ofst = ((int16_t *)scale_buffer); - float* cur_bias_row_ofst = ((float *)bias_buffer); - int16_t* cur_output_row_ofst = ((int16_t *)output_buffer) + i * changed_num; - - for( j = 0; j < changed_num; j++ ) - { - float cur_input_v = vsi_nn_Fp16ToFp32(*(cur_input_row_ofst + j)); - float cur_scale_v = vsi_nn_Fp16ToFp32(*(cur_scale_row_ofst + j)); - float cur_bias_v = *(cur_bias_row_ofst + j); - - float cur_result = cur_input_v * cur_scale_v + cur_bias_v; - *(cur_output_row_ofst + j) = vsi_nn_Fp32ToFp16(cur_result); - } - } - -#if defined(_SAVE_TENSOR) - { - static int count = 0; - char fname[256] = { 0 }; - sprintf(fname, "scale_output_tensor.%d.axis.%d.txt", count, axis); - vsi_nn_SaveDataToText(fname, output_buffer, - vsi_nn_ShapeProduct(output_size, output_dims), VSI_NN_TYPE_FLOAT16, NULL); - count++; - } -#endif - } - status = vsi_nn_vxCopyDataToTensor(context, output_tensor, &out_attr, output_buffer); - TEST_CHECK_STATUS(status, OnError); -OnError: - if( NULL != input_buffer ) - { - free( input_buffer ); - input_buffer = NULL; - } - if( NULL != scale_buffer ) - { - free( scale_buffer ); - scale_buffer = NULL; - } - if( NULL != bias_buffer ) - { - free( bias_buffer ); - bias_buffer = NULL; - } - if( NULL != output_buffer ) - { - free( output_buffer ); - output_buffer = NULL; - } - - if (input_user_addr) - { - vxReleaseTensorAddressing(&input_user_addr); - } - if (scale_user_addr) - { - vxReleaseTensorAddressing(&scale_user_addr); - } - if (bias_user_addr) - { - vxReleaseTensorAddressing(&bias_user_addr); - } - if (output_user_addr) - { - vxReleaseTensorAddressing(&output_user_addr); - } - - } - - return status; -} /* _VX_KERNEL_FUNC_KERNEL() */ - -static vx_param_description_t s_params[] = -{ - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, - { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, 
VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, -}; - -vsi_status VX_CALLBACK vxScaleInitializer - ( - vx_node nodObj, - const vx_reference *paramObj, - uint32_t paraNum - ) -{ - // Alignment with a power of two value. -#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) - vx_kernel_execution_parameters_t shaderParam = { - 2, // workdim - {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image - {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread - {0, 0, 0}, // localWorkSize: local group size in thread - {0, 0, 0}}; // globalWorkSize: image size in thread - uint32_t uniExtractHalf8_2x8[16] = { - 0x11111111, // TCfg - 0x11110000, // ASelt - 0x06040200, 0x06040200, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, - 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant - }; - uint32_t uniFp16MulFp16ToFp32_Lo_4x4[16] = { - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00010000, 0x00030002, // ABin - 0x01010101, // BSelt - 0x00010000, 0x00030002, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }; - uint32_t uniFp16MulFp16ToFp32_Hi_4x4[16] = { - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00050004, 0x00070006, // ABin - 0x01010101, // BSelt - 0x00050004, 0x00070006, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }; - - vsi_status status = VX_SUCCESS; - - vx_tensor input = (vx_tensor)paramObj[0]; - uint32_t input_size[DIM_SIZE] = {1, 1, 1, 1}; - vx_uint32 i = 0; - vsi_nn_tensor_attr_t attr; - - memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); - status = vsi_nn_vxGetTensorAttr(input, &attr); - if (status != VX_SUCCESS) - { - VSILOGE("vsi_nn_vxGetTensorAttr failure! 
at line %d\n", __LINE__); - return status; - } - for (i = 0; i < attr.dim_num; i++) - { - input_size[i] = attr.size[i]; - } - - shaderParam.globalWorkOffset[0] = 0; - shaderParam.globalWorkOffset[1] = 0; - shaderParam.globalWorkScale[0] = 8; - shaderParam.globalWorkScale[1] = 1; - shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1) - / shaderParam.globalWorkScale[0], 4); - shaderParam.globalWorkSize[1] = (input_size[1] + shaderParam.globalWorkScale[1] - 1) - / shaderParam.globalWorkScale[1]; - - vxSetNodeUniform(nodObj, "uniExtractHalf8_2x8", 1, uniExtractHalf8_2x8); - vxSetNodeUniform(nodObj, "uniFp16MulFp16ToFp32_Lo_4x4", 1, uniFp16MulFp16ToFp32_Lo_4x4); - vxSetNodeUniform(nodObj, "uniFp16MulFp16ToFp32_Hi_4x4", 1, uniFp16MulFp16ToFp32_Hi_4x4); - - status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, - &shaderParam, sizeof(vx_kernel_execution_parameters_t)); - - return VX_SUCCESS; -} - -static vx_param_description_t vxScaleKernelParam[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED} -}; - -#ifdef __cplusplus -extern "C" { -#endif -vx_kernel_description_t _VX_KERNEL_VAR = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - _VX_KERNEL_FUNC_KERNEL, - s_params, - _cnt_of_array( s_params ), - vsi_nn_KernelValidator, - NULL, - NULL, - vsi_nn_KernelInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxScaleKernelInfo = -{ - VX_KERNEL_ENUM_SCALE, - VX_KERNEL_NAME_SCALE_FP16, - NULL, - vxScaleKernelParam, - (sizeof(vxScaleKernelParam) / sizeof(vxScaleKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxScaleInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t * vx_kernel_SCALE_list[] = -{ - &_VX_KERNEL_VAR, - &vxScaleKernelInfo, - NULL -}; -#ifdef __cplusplus -} -#endif - diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_shufflechannel.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_shufflechannel.c deleted file mode 100644 index acdc249..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_shufflechannel.c +++ /dev/null @@ -1,345 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ - -#include -#include -#include -#include - -#include "vsi_nn_platform.h" - -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" -#include "vsi_nn_test.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -vsi_status vxShuffleChannelFunc - ( - vx_context context, - vx_tensor input, - vx_tensor output, - int32_t group_number, - int32_t axis - ) -{ - vsi_status status = VX_SUCCESS; - vsi_nn_tensor_attr_t input_attr; - vsi_nn_tensor_attr_t output_attr; - uint8_t *in_data = NULL; - uint8_t *out_data = NULL; - uint32_t stride_size[VSI_NN_MAX_DIM_NUM] = {0}; - uint32_t buf_sz = 0; - uint32_t group_row = group_number; - uint32_t chs = 0, group_col = 0; - uint32_t len = 1, num = 1, feature_map_size = 1; - uint32_t n = 0, i = 0, j = 0; - uint32_t type_bytes = 0, len_bytes = 0, fms_bytes = 0; - - status = vsi_nn_vxGetTensorAttr(input, &input_attr); - status |= vsi_nn_vxGetTensorAttr(output, &output_attr); - TEST_CHECK_STATUS(status, final); - in_data = vsi_nn_vxCopyTensorToData(context, input, &input_attr); - TEST_CHECK_PTR(in_data, final); - buf_sz = vsi_nn_GetStrideSize(&output_attr, stride_size); - out_data = (uint8_t *)malloc( buf_sz ); - TEST_CHECK_PTR(out_data, final); - - chs = input_attr.size[axis]; - group_col = chs / group_row; - type_bytes = vsi_nn_TypeGetBytes( input_attr.dtype.vx_type ); - - for ( i = 0; i < (uint32_t)axis; i++) - { - len *= input_attr.size[i]; - } - for ( i = axis + 1; i < input_attr.dim_num; i++) - { - num *= input_attr.size[i]; - } - for ( i = 0; i <= (uint32_t)axis; i++) - { - feature_map_size *= input_attr.size[i]; - } - - /* Shuffle Channel CPU Implement, the shape and dtype of output must same as input */ - len_bytes = len * type_bytes; - fms_bytes = feature_map_size * type_bytes; - for ( n = 0; n < num; n++) - { - for ( i = 0; i < group_row; i++) - { - for ( j = 0; j < group_col; j++) - { - uint8_t *in_ptr = in_data + n * fms_bytes + (i * group_col + j) * len_bytes; - uint8_t *out_ptr = out_data + n * fms_bytes + (j * group_row + i) * len_bytes; - - memcpy(out_ptr, in_ptr, len_bytes); - } - } - } - - /* Copy data to output tensor */ - status = vsi_nn_vxCopyDataToTensor(context, output, &output_attr, out_data); - TEST_CHECK_STATUS(status, final); -final: - if (in_data) free(in_data); - if (out_data) free(out_data); - return status; -} -vsi_status VX_CALLBACK vxShuffleChannelKernel - ( - vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ - vsi_status status = VX_ERROR_INVALID_PARAMETERS; - - if(paramNum == 4) - { - vx_context context = NULL; - // tensor - vx_tensor imgObj[2] = { NULL }; - // scalar - vx_scalar scalar[2] = { NULL }; - int32_t group_number = 0; - int32_t axis = 0; - - imgObj[0] = (vx_tensor)paramObj[0]; - imgObj[1] = (vx_tensor)paramObj[1]; - scalar[0] = (vx_scalar)paramObj[2]; - scalar[1] = (vx_scalar)paramObj[3]; - - context = vxGetContext((vx_reference)node); - TEST_CHECK_PTR(context,final); - // scalar - status = vxCopyScalar(scalar[0], &group_number, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - TEST_CHECK_STATUS(status, final); - status = vxCopyScalar(scalar[1], &axis, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - TEST_CHECK_STATUS(status, final); - - // Call C Prototype - status = vxShuffleChannelFunc(context, imgObj[0], imgObj[1], group_number, axis); - TEST_CHECK_STATUS(status, final); - } -final: - 
return status; -} -vsi_status VX_CALLBACK vxShuffleChannelInitializer - ( - vx_node nodObj, - const vx_reference *paramObj, - uint32_t paraNum - ) -{ - vsi_status status = VX_SUCCESS; - // Alignment with a power of two value. -#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) - vx_kernel_execution_parameters_t shaderParam = { - 3, // workdim - {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image - {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread - {0, 0, 0}, // localWorkSize: local group size in thread - {0, 0, 0}}; // globalWorkSize: image size in thread - - vx_tensor input = (vx_tensor)paramObj[0]; - vx_scalar group_numbers = (vx_scalar)paramObj[2]; - vx_scalar axis_s = (vx_scalar)paramObj[3]; - uint32_t input_size[4] = {1, 1, 1, 1}; - vsi_nn_type_e inputDataFormat = VSI_NN_TYPE_FLOAT16; - int32_t group_number = 0; - int32_t axis = 0; - int32_t group_column = 0; - float rgroup_column = 0.0f; - uint32_t chs = 0; - vx_uint32 i = 0; - vsi_nn_tensor_attr_t attr; - - memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); - status = vsi_nn_vxGetTensorAttr(input, &attr); - if (status != VX_SUCCESS) - { - VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__); - return status; - } - for (i = 0; i < attr.dim_num; i++) - { - input_size[i] = attr.size[i]; - } - inputDataFormat = attr.dtype.vx_type; - - status |= vxCopyScalar(group_numbers, &group_number, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - status |= vxCopyScalar(axis_s, &axis, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - if(VX_SUCCESS != status) - { - VSILOGE("[%s : %d]Initializer failure! \n",__FILE__, __LINE__); - return status; - } - chs = input_size[axis]; - if (chs % group_number) - { - VSILOGE("input channel can't be exact divided by group number! at line %d\n", __LINE__); - return VX_FAILURE; - } - - shaderParam.globalWorkOffset[0] = 0; - shaderParam.globalWorkOffset[1] = 0; - shaderParam.globalWorkOffset[2] = 0; - if (axis == 2) - { - if (inputDataFormat == VSI_NN_TYPE_FLOAT16 || inputDataFormat == VSI_NN_TYPE_INT16) - shaderParam.globalWorkScale[0] = 8; - else - shaderParam.globalWorkScale[0] = 16; - shaderParam.globalWorkScale[1] = 4; - shaderParam.globalWorkScale[2] = 1; - - shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1) - / shaderParam.globalWorkScale[0], 4); - shaderParam.globalWorkSize[1] = (input_size[1] + shaderParam.globalWorkScale[1] - 1) - / shaderParam.globalWorkScale[1]; - shaderParam.globalWorkSize[2] = input_size[2]; - } - else if (axis == 1) - { - shaderParam.globalWorkScale[0] = 32; - shaderParam.globalWorkScale[1] = 1; - shaderParam.globalWorkScale[2] = 1; - - shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1) - / shaderParam.globalWorkScale[0], 4); - shaderParam.globalWorkSize[1] = input_size[1]; - shaderParam.globalWorkSize[2] = input_size[2]; - } - else - { - VSILOGE("[%s : %d]Initializer failure, not support axis: %d! \n",__FILE__, __LINE__, axis); - return VX_FAILURE; - } - group_column = chs / group_number; - rgroup_column = 1.0f / group_column; - - status |= vxSetNodeUniform(nodObj, "group_column", 1, &group_column); - status |= vxSetNodeUniform(nodObj, "rgroup_column", 1, &rgroup_column); - status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, - &shaderParam, sizeof(vx_kernel_execution_parameters_t)); - - if(status < 0) - { - VSILOGE("[%s : %d]Initializer failure! 
\n",__FILE__, __LINE__); - } - return status; -} -static vx_param_description_t vxShuffleChannelKernelParam[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED} -}; -#ifdef __cplusplus -extern "C" { -#endif -vx_kernel_description_t vxShuffleChannelKernelInfo = -{ - VX_KERNEL_ENUM_SHUFFLECHANNEL, - VX_KERNEL_NAME_SHUFFLECHANNEL, - NULL, - vxShuffleChannelKernelParam, - (sizeof(vxShuffleChannelKernelParam) / sizeof(vxShuffleChannelKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxShuffleChannelInitializer, - vsi_nn_KernelDeinitializer -}; -vx_kernel_description_t vxShuffleChannelKernelInfo8Bits = -{ - VX_KERNEL_ENUM_SHUFFLECHANNEL, - VX_KERNEL_NAME_SHUFFLECHANNEL8BITS, - NULL, - vxShuffleChannelKernelParam, - (sizeof(vxShuffleChannelKernelParam) / sizeof(vxShuffleChannelKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxShuffleChannelInitializer, - vsi_nn_KernelDeinitializer -}; -vx_kernel_description_t vxShuffleChannelKernelInfo_CPU = -{ - VX_KERNEL_ENUM_SHUFFLECHANNEL, - VX_KERNEL_NAME_SHUFFLECHANNEL, - vxShuffleChannelKernel, - vxShuffleChannelKernelParam, - (sizeof(vxShuffleChannelKernelParam) / sizeof(vxShuffleChannelKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vsi_nn_KernelInitializer, - vsi_nn_KernelDeinitializer -}; -vx_kernel_description_t vxShuffleChannelKernelInfo_16BitsAxis1 = -{ - VX_KERNEL_ENUM_SHUFFLECHANNEL, - VX_KERNEL_NAME_SHUFFLECHANNEL16BITS_AXIS1, - NULL, - vxShuffleChannelKernelParam, - (sizeof(vxShuffleChannelKernelParam) / sizeof(vxShuffleChannelKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxShuffleChannelInitializer, - vsi_nn_KernelDeinitializer -}; -vx_kernel_description_t vxShuffleChannelKernelInfo_8BitsAxis1 = -{ - VX_KERNEL_ENUM_SHUFFLECHANNEL, - VX_KERNEL_NAME_SHUFFLECHANNEL8BITS_AXIS1, - NULL, - vxShuffleChannelKernelParam, - (sizeof(vxShuffleChannelKernelParam) / sizeof(vxShuffleChannelKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxShuffleChannelInitializer, - vsi_nn_KernelDeinitializer -}; -vx_kernel_description_t * vx_kernel_SHUFFLECHANNEL_list[] = -{ - &vxShuffleChannelKernelInfo_CPU, - &vxShuffleChannelKernelInfo, - &vxShuffleChannelKernelInfo8Bits, - &vxShuffleChannelKernelInfo_16BitsAxis1, - &vxShuffleChannelKernelInfo_8BitsAxis1, - NULL -}; -#ifdef __cplusplus -} -#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_space2depth.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_space2depth.c deleted file mode 100644 index 67308f8..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_space2depth.c +++ /dev/null @@ -1,293 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - -#include -#include -#include -#include - -#include "vsi_nn_platform.h" - -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -#define _VX_KERNEL_VAR (vx_kernel_SPACE2DEPTH) -#define _VX_KERNEL_ID (VX_KERNEL_ENUM_SPACE2DEPTH) -#define _VX_KERNEL_NAME ("vsi_nn_kernel_space2depth") -#define _VX_KERNEL_FUNC_KERNEL (vxSpace2DepthKernel) - -static vsi_status VX_CALLBACK vxSpace2DepthKernel - ( - vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ - /* TODO: */ -#define ARG_NUM (2) -#define TENSOR_NUM_INPUT (1) -#define TENSOR_NUM_OUTPUT (1) -#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT) - - vsi_status status = VX_SUCCESS; - uint32_t i = 0; - vx_context context = NULL; - vsi_nn_tensor_attr_t attr[TENSOR_NUM]; - uint32_t stride_size[TENSOR_NUM][VSI_NN_MAX_DIM_NUM]; - vx_tensor_addressing user_addr[TENSOR_NUM] = {NULL}; - uint8_t *buffer_ptr[TENSOR_NUM] = {NULL}; - vx_tensor tensor[TENSOR_NUM] = {NULL}; - - int32_t block_size_x = 0, block_size_y = 0; - int32_t output_depth = 0, output_height = 0, output_width = 0; - int32_t input_batch = 0, input_depth = 0, input_height = 0, input_width = 0; - int32_t batch = 0, dim = 0; - - for(i = 0; i < TENSOR_NUM; i++) - { - memset(&attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); - } - - //prepare data - context = vxGetContext((vx_reference)node); - - for( i = 0; i < TENSOR_NUM_INPUT; i ++ ) - { - tensor[i] = (vx_tensor)paramObj[i]; - buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i], - &(attr[i]), stride_size[i], &(user_addr[i]), VX_READ_ONLY); - } - for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ ) - { - tensor[i] = (vx_tensor)paramObj[i]; - buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i], - &(attr[i]), stride_size[i], &(user_addr[i]), VX_WRITE_ONLY); - } - - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(block_size_x), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 1], &(block_size_y), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - - dim = attr[0].dim_num; - if(dim < 4) - attr[0].size[3] = 1; - //op calc - //output_batch = attr[1].size[3]; - output_depth = attr[1].size[2]; - output_height = attr[1].size[1]; - output_width = attr[1].size[0]; - - input_batch = attr[0].size[3]; - input_depth = attr[0].size[2]; - input_height = attr[0].size[1]; - input_width = attr[0].size[0]; - - for (batch = 0; batch < input_batch; ++batch) - { - vx_uint32 output_batch_index = batch * output_height * output_width * output_depth; - vx_uint32 input_batch_index = batch * input_height * input_width * input_depth; - vx_uint32 in_d; - for (in_d = 0; in_d < (vx_uint32)input_depth; in_d ++) - { - vx_uint32 in_h; - for (in_h = 0; in_h < (vx_uint32)input_height; ++ in_h) - { - vx_uint32 in_w; - for (in_w = 0; in_w < 
(vx_uint32)input_width; in_w ++) - { - vx_int32 out_w = in_w / block_size_x; - vx_int32 out_h = in_h / block_size_y; - //vx_int32 out_d = (in_w % block_size_x) * input_depth + (in_h % block_size_y) * block_size_x * input_depth + in_d; - vx_int32 out_d = (in_w % block_size_x) + (in_h % block_size_y) * block_size_x + in_d * block_size_x * block_size_y; - - vx_int32 in_index = in_w + in_h * input_width +in_d * input_height * input_width + input_batch_index; - vx_int32 out_index = out_w + out_h * output_width + out_d * output_width * output_height + output_batch_index; - - //outputBase[out_index] = inputBase[in_index]; - float fval; - vsi_nn_DtypeToFloat32(&buffer_ptr[0][stride_size[0][0] * in_index], - &fval, &attr[0].dtype); - vsi_nn_Float32ToDtype(fval, &buffer_ptr[1][stride_size[1][0] * out_index], - &attr[1].dtype); - } - } - } - } - - //save data - for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ ) - { - vsi_nn_copy_tensor_patch(tensor[i], &attr[i], buffer_ptr[i], VX_WRITE_ONLY); - } - for( i = 0; i < TENSOR_NUM; i ++ ) - { - if (user_addr[i]) vxReleaseTensorAddressing(&(user_addr[i])); - if (buffer_ptr[i]) free(buffer_ptr[i]); - } - - return status; -} /* _VX_KERNEL_FUNC_KERNEL() */ - -vsi_status VX_CALLBACK vxSpace2DepthInitializer - ( - vx_node nodObj, - const vx_reference *paramObj, - uint32_t paraNum - ) -{ - // Alignment with a power of two value. -#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) - vx_kernel_execution_parameters_t shaderParam = { - 3, // workdim - {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image - {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread - {0, 0, 0}, // localWorkSize: local group size in thread - {0, 0, 0}}; // globalWorkSize: image size in thread - - vsi_status status = VX_SUCCESS; - - vx_tensor input = (vx_tensor)paramObj[0]; - uint32_t input_size[4] = {1, 1, 1, 1}; - vx_uint32 input_dimz = 0; - vx_uint32 input_depth = 0; - vx_uint32 i = 0; - vsi_nn_tensor_attr_t attr; - - memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); - status = vsi_nn_vxGetTensorAttr(input, &attr); - if (status != VX_SUCCESS) - { - VSILOGE("vsi_nn_vxGetTensorAttr failure! 
at line %d\n", __LINE__); - return status; - } - for (i = 0; i < attr.dim_num; i++) - { - input_size[i] = attr.size[i]; - } - - input_depth = input_size[2]; - if(input_size[3] > 0) - input_dimz = input_depth * input_size[3]; - - shaderParam.globalWorkOffset[0] = 0; - shaderParam.globalWorkOffset[1] = 0; - shaderParam.globalWorkOffset[2] = 0; - shaderParam.globalWorkScale[0] = 8; - shaderParam.globalWorkScale[1] = 1; - shaderParam.globalWorkScale[2] = 1; - shaderParam.localWorkSize[0] = 8; - shaderParam.localWorkSize[1] = 1; - shaderParam.localWorkSize[2] = 1; - shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1) - / shaderParam.globalWorkScale[0], shaderParam.localWorkSize[0]); - shaderParam.globalWorkSize[1] = gcmALIGN((input_size[1] + shaderParam.globalWorkScale[1] - 1) - / shaderParam.globalWorkScale[1], shaderParam.localWorkSize[1]); - shaderParam.globalWorkSize[2] = input_dimz; - - { - vx_uint32 uniExtractEvenFp16Stride2_4x4[16] = { - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00020000, 0x00060004, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant - }; - vx_uint32 uniExtractOddFp16Stride2_4x4[16] = { - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00030001, 0x00070005, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant - }; - status |= vxSetNodeUniform(nodObj, "uniExtractEvenFp16Stride2_4x4", 1, uniExtractEvenFp16Stride2_4x4); - status |= vxSetNodeUniform(nodObj, "uniExtractOddFp16Stride2_4x4", 1, uniExtractOddFp16Stride2_4x4); - //status |= vxSetNodeUniform(nodObj, "input_depth", 1, &input_depth); - } - - status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, - &shaderParam, sizeof(vx_kernel_execution_parameters_t)); - - return VX_SUCCESS; -} - -static vx_param_description_t s_params[] = -{ - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, -}; - -#ifdef __cplusplus -extern "C" { -#endif -vx_kernel_description_t _VX_KERNEL_VAR = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - _VX_KERNEL_FUNC_KERNEL, - s_params, - _cnt_of_array( s_params ), - vsi_nn_KernelValidator, - NULL, - NULL, - vsi_nn_KernelInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxSpace2DepthKernelInfo_int16_int16 = -{ - _VX_KERNEL_ID, - VX_KERNEL_NAME_SPACE2DEPTH_INT16_INT16, - NULL, - s_params, - _cnt_of_array( s_params ), - vsi_nn_KernelValidator, - NULL, - NULL, - vxSpace2DepthInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t * vx_kernel_SPACE2DEPTH_list[] = -{ - NULL, - &_VX_KERNEL_VAR, - &vxSpace2DepthKernelInfo_int16_int16, - NULL -}; -#ifdef __cplusplus -} -#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/a_times_b_plus_c.vx b/src/tim/vx/internal/src/libnnext/ops/vx/a_times_b_plus_c.vx index f19c623..5130391 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/a_times_b_plus_c.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/a_times_b_plus_c.vx @@ -54,3 +54,81 @@ __kernel void a_times_b_plus_c_F16_F16_F16toF16_2D VXC_WriteImage(output, coord, 
result, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); } +_viv_uniform VXC_512Bits uniExtractHalf8_2x8; +_viv_uniform VXC_512Bits uniA_Times_B_lo_4x4; +_viv_uniform VXC_512Bits uniA_Times_B_hi_4x4; +__kernel void a_times_b_plus_c_F16_F16_F32toF16 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __read_only image2d_array_t input2, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_half8 src0, src1, dst; + vxc_ushort8 vec0, vec1, result; + float4 b0, b1; + float4 dst0, dst1; + + VXC_ReadImage2DArray(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src0, vec0, 16); + VXC_ReadImage2DArray(vec1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src1, vec1, 16); + b0 = read_imagef(input2, coord); + coord.x += 4; + b1 = read_imagef(input2, coord); + coord.x -= 4; + + VXC_DP4x4(dst0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniA_Times_B_lo_4x4); + VXC_DP4x4(dst1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniA_Times_B_hi_4x4); + dst0 += b0; + dst1 += b1; + + half4 t0, t1; + _viv_asm(CONV, t0, dst0); + _viv_asm(CONV, t1, dst1); + VXC_DP2x8(dst, t0, t1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); + _viv_asm(COPY, result, dst, 16); + + VXC_WriteImage2DArray(output, coord, result, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void a_times_b_plus_c_F16_F16_F32toF16_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __read_only image2d_t input2, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + + vxc_half8 src0, src1, dst; + vxc_ushort8 vec0, vec1, result; + float4 b0, b1; + float4 dst0, dst1; + + VXC_ReadImage(vec0, input0, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src0, vec0, 16); + VXC_ReadImage(vec1, input1, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src1, vec1, 16); + b0 = read_imagef(input2, coord.xy); + coord.z = coord.x + 4; + b1 = read_imagef(input2, coord.zy); + + VXC_DP4x4(dst0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniA_Times_B_lo_4x4); + VXC_DP4x4(dst1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniA_Times_B_hi_4x4); + dst0 += b0; + dst1 += b1; + + half4 t0, t1; + _viv_asm(CONV, t0, dst0); + _viv_asm(CONV, t1, dst1); + VXC_DP2x8(dst, t0, t1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); + _viv_asm(COPY, result, dst, 16); + + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx index e36dfdb..90b5135 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx @@ -1,10 +1,11 @@ #include "cl_viv_vx_ext.h" _viv_uniform int indices_num; +_viv_uniform VXC_512Bits uniExtraCopyDpKeepinEvis_2x8; __kernel void gather_I8toI8( __read_only image2d_t input0, - __read_only image2d_array_t input1, + __read_only image2d_t input1, __write_only image2d_t output, int block_size, int block_num, @@ -16,7 +17,7 @@ __kernel void gather_I8toI8( int gidz = get_global_id(2); // block_num int4 coord_in = (int4)(gidy, 0, gidx, 0); - int4 indice = read_imagei(input1, coord_in.xyyy); + int4 indice = read_imagei(input1, coord_in.xy); coord_in.w = gidz * 
axis_num + indice.x; vxc_char16 src; @@ -28,7 +29,7 @@ __kernel void gather_I8toI8( __kernel void gather_U8toU8( __read_only image2d_t input0, - __read_only image2d_array_t input1, + __read_only image2d_t input1, __write_only image2d_t output, int block_size, int block_num, @@ -40,7 +41,7 @@ __kernel void gather_U8toU8( int gidz = get_global_id(2); // block_num int4 coord_in = (int4)(gidy, 0, gidx, 0); - int4 indice = read_imagei(input1, coord_in.xyyy); + int4 indice = read_imagei(input1, coord_in.xy); coord_in.w = gidz * axis_num + indice.x; vxc_uchar16 src; @@ -52,7 +53,7 @@ __kernel void gather_U8toU8( __kernel void gather_I16toI16( __read_only image2d_t input0, - __read_only image2d_array_t input1, + __read_only image2d_t input1, __write_only image2d_t output, int block_size, int block_num, @@ -66,7 +67,7 @@ __kernel void gather_I16toI16( int4 coord_in = (int4)(gidy, 0, gidx, 0); - int4 indice = read_imagei(input1, coord_in.xyyy); + int4 indice = read_imagei(input1, coord_in.xy); coord_in.w = gidz * axis_num + indice.x; vxc_short8 src; @@ -78,7 +79,7 @@ __kernel void gather_I16toI16( __kernel void gather_F16toF16( __read_only image2d_t input0, - __read_only image2d_array_t input1, + __read_only image2d_t input1, __write_only image2d_t output, int block_size, int block_num, @@ -92,7 +93,7 @@ __kernel void gather_F16toF16( int4 coord_in = (int4)(gidy, 0, gidx, 0); - int4 indice = read_imagei(input1, coord_in.xyyy); + int4 indice = read_imagei(input1, coord_in.xy); coord_in.w = gidz * axis_num + indice.x; vxc_short8 src; @@ -101,3 +102,107 @@ __kernel void gather_F16toF16( int2 coord = (int2)(gidx, gidz * indices_num + gidy); VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); } + +__kernel void gather_I8toI8_axis0( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 indices = read_imagei(input1, coord.xx); + int2 coord_in = (int2)(indices.x, get_global_id(1)); + + vxc_char16 src, dst; + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + indices.x = get_global_id(1); + VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniExtraCopyDpKeepinEvis_2x8); + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void gather_U8toU8_axis0( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 indices = read_imagei(input1, coord.xx); + int2 coord_in = (int2)(indices.x, get_global_id(1)); + + vxc_uchar16 src, dst; + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + indices.x = get_global_id(1); + VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + 
uniExtraCopyDpKeepinEvis_2x8); + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void gather_I16toI16_axis0( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 indices = read_imagei(input1, coord.xx); + int2 coord_in = (int2)(indices.x, get_global_id(1)); + + vxc_short8 src, dst; + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + indices.x = get_global_id(1); + VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniExtraCopyDpKeepinEvis_2x8); + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void gather_F16toF16_axis0( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 indices = read_imagei(input1, coord.xx); + int2 coord_in = (int2)(indices.x, get_global_id(1)); + + vxc_short8 src, dst; + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + indices.x = get_global_id(1); + VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniExtraCopyDpKeepinEvis_2x8); + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix.vx index e3950b1..e9b8fd1 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix.vx @@ -11,7 +11,7 @@ _viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp #define GATHER_8BITS_TO_F16(src0_type_name, read_type) \ __kernel void gather_##src0_type_name##toF16( \ __read_only image2d_t input0, \ - __read_only image2d_array_t input1, \ + __read_only image2d_t input1, \ __write_only image2d_t output, \ int block_size, \ int block_num, \ @@ -23,7 +23,7 @@ __kernel void gather_##src0_type_name##toF16( \ int gidz = get_global_id(2); \ \ int4 coord_in = (int4)(gidy, 0, gidx, 0); \ - int4 indice = read_imagei(input1, coord_in.xyyy); \ + int4 indice = read_imagei(input1, coord_in.xy); \ coord_in.w = gidz * axis_num + indice.x; \ \ read_type src; \ @@ -47,7 +47,7 @@ GATHER_8BITS_TO_F16(I8, vxc_char16) #define GATHER_F16_TO_QINT(src1_type_name, write_type) \ __kernel void gather_F16to##src1_type_name( \ __read_only image2d_t input0, \ - __read_only image2d_array_t input1, \ + __read_only image2d_t input1, \ __write_only image2d_t output, \ int block_size, \ int block_num, \ @@ -59,7 +59,7 @@ __kernel void gather_F16to##src1_type_name( \ int gidz = get_global_id(2); \ int4 coord_in = (int4)(gidy, 0, gidx, 0); \ \ - int4 indice = read_imagei(input1, coord_in.xyyy); \ + int4 indice = 
read_imagei(input1, coord_in.xy); \ coord_in.w = gidz * axis_num + indice.x; \ \ vxc_short8 src; \ @@ -79,7 +79,7 @@ GATHER_F16_TO_QINT(I16, vxc_short8) __kernel void gather_I16toF16( __read_only image2d_t input0, - __read_only image2d_array_t input1, + __read_only image2d_t input1, __write_only image2d_t output, int block_size, int block_num, @@ -91,7 +91,7 @@ __kernel void gather_I16toF16( int gidz = get_global_id(2); int4 coord_in = (int4)(gidy, 0, gidx, 0); - int4 indice = read_imagei(input1, coord_in.xyyy); + int4 indice = read_imagei(input1, coord_in.xy); coord_in.w = gidz * axis_num + indice.x; vxc_short8 src; @@ -109,3 +109,97 @@ __kernel void gather_I16toF16( VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); } + +#define GATHER_8BITS_TO_F16_AXIS0(src0_type_name, read_type) \ +__kernel void gather_##src0_type_name##toF16_axis0( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int block_size, \ + int block_num, \ + int axis_num \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + int4 indices = read_imagei(input1, coord.xx); \ + int2 coord_in = (int2)(indices.x, get_global_id(1)); \ + \ + read_type src; \ + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + indices.x = get_global_id(1); \ + VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + vxc_half8 src0; \ + vxc_short8 dst0; \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + VXC_DP2x8(src0,src,ms0, VXC_MODIFIER(0,7,0, VXC_RM_TowardZero,1), uniU8MulAndPostShift_0_Lo_2x8); \ + _viv_asm(COPY, dst0, src0, 16); \ + VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +GATHER_8BITS_TO_F16_AXIS0(U8, vxc_uchar16) +GATHER_8BITS_TO_F16_AXIS0(I8, vxc_char16) + +#define GATHER_F16_TO_QINT_AXIS0(src1_type_name, write_type) \ +__kernel void gather_F16to##src1_type_name##_axis0( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int block_size, \ + int block_num, \ + int axis_num \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + int4 indices = read_imagei(input1, coord.xx); \ + int2 coord_in = (int2)(indices.x, get_global_id(1)); \ + \ + vxc_short8 src; \ + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + indices.x = get_global_id(1); \ + VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + vxc_ushort8 mp1; \ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \ + vxc_half8 data; \ + write_type dst; \ + _viv_asm(COPY, data, src, 16); \ + VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven, 1),uniConvertFp16toU8_2x8); \ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +GATHER_F16_TO_QINT_AXIS0(U8, vxc_uchar16) +GATHER_F16_TO_QINT_AXIS0(I8, vxc_char16) +GATHER_F16_TO_QINT_AXIS0(I16, vxc_short8) + +__kernel void gather_I16toF16_axis0( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + 
int block_num, + int axis_num + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 indices = read_imagei(input1, coord.xx); + int2 coord_in = (int2)(indices.x, get_global_id(1)); + + vxc_short8 src; + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + indices.x = get_global_id(1); + VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + vxc_half8 src0; + vxc_short8 dst0; + vxc_ushort8 ms0; + _viv_asm(COPY, ms0, multAndoutZP0, 16); + VXC_DP2x8(src0, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniU8MulAndPostShift_0_Lo_2x8); + _viv_asm(COPY, dst0, src0, 16); + + VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization.vx new file mode 100644 index 0000000..7a796a2 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization.vx @@ -0,0 +1,279 @@ +#include "cl_viv_vx_ext.h" + +/**************************layernorm float16***********************************/ +_viv_uniform int width; +_viv_uniform float dimRatio; +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4; + +__kernel void layer_norm_F16toF16( + image2d_array_t input, image2d_t bias, image2d_t scale, + image2d_array_t output, float eps) +{ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + vxc_short8 src0, src1; + vxc_float sum = 0, sqr = 0; + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + for(coord.x = 8; coord.x < (width+8); coord.x += 8) + { + vxc_half8 val0_h; + _viv_asm(COPY, val0_h, src0, 16); + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + vxc_float4 sumsqr; + VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniFp16SumSqr_dp8x2); + sum += sumsqr.x; + sqr += sumsqr.y; + } + vxc_float mean; + mean = sum * dimRatio; + vxc_float vari; + vari = sqr*dimRatio - mean*mean; + vari += eps; + vari = rsqrt(vari); + vxc_float4 bias_f; + for(coord.x = 0; coord.x < width; coord.x += 4) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord.xw); + vxc_half8 in_h, scale_h; + _viv_asm(COPY, in_h, src0, 16); + _viv_asm(COPY, scale_h, src1, 16); + vxc_float4 in_f, scale_f; + VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + 
UniFP16toFP32Lo4_dp4x4); + vxc_float4 sub, norm; + sub = in_f - mean; + norm = scale_f * vari * sub + bias_f; + half4 norm_h; + _viv_asm(CONV, norm_h, norm); + vxc_half8 dst; + VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniExtractHalf4_dp4x4); + vxc_short8 dstval; + _viv_asm(COPY, dstval, dst, 16); + coord_out.x = coord.x; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dstval, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + } +} +/*****************************layernorm uint8 to uint8****************************/ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniSumU8_16x1; +_viv_uniform VXC_512Bits uniSqrSum_16x1; +_viv_uniform float input_scale; +_viv_uniform int inputZP; +_viv_uniform float outputScale; +_viv_uniform float output_zp; +_viv_uniform int sumInZp; +_viv_uniform int tmpZp1; +_viv_uniform int tmpZp2; +_viv_uniform float e2InScale; +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +__kernel void layer_norm_U8toU8( + image2d_array_t input, image2d_t bias, image2d_t scale, + image2d_array_t output, float eps) +{ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + vxc_uchar16 src0, src2; + vxc_short8 src1; + vxc_half8 scale_h; + float sum = 0, sqr = 0; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + int tmpSum = 0, tmpSqr = 0; + vxc_int4 tmpSum1; + vxc_int4 tmpSqr1; + short zp = inputZP; + + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + for(coord.x = 0; coord.x < width; coord.x += 16) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); + tmpSum += (tmpSum1.x); + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); + tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x); + } + sum = (tmpSum + sumInZp) * input_scale; + sqr = (tmpSqr + tmpZp2) * e2InScale; + + float mean, vari; + mean = sum * dimRatio; + vari = sqr*dimRatio - mean*mean; + vari += eps; + vari = rsqrt(vari); + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; + int2 coord_bias = (int2)(0, 0); + + for(coord.x = 0; coord.x < width; coord.x += 16) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_bias.x = coord.x; + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + + 
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert4thUint8SubZpToFp32_4x4); + tmpData0 *= input_scale; + tmpData1 *= input_scale; + tmpData2 *= input_scale; + tmpData3 *= input_scale; + + vxc_float4 norm; + tmpData0 -= mean; + norm = scale_f0 * vari * tmpData0 + bias_f0; + bias_f0 = read_imagef(bias, coord_bias); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + coord_bias.x += 4; + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); + + tmpData1 -= mean; + norm = scale_f1 * vari * tmpData1 + bias_f1; + bias_f1 = read_imagef(bias, coord_bias); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + + tmpData2 -= mean; + norm = scale_f0 * vari * tmpData2 + bias_f0; + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); + + tmpData3 -= mean; + norm = scale_f1 * vari * tmpData3 + bias_f1; + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + coord_out.x = coord.x; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src2, \ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + } +} +/***************************layernorm float16 to uint8**************************/ +__kernel void layer_norm_F16toU8( + image2d_array_t input, image2d_t bias, image2d_t scale, + image2d_array_t output, float eps) +{ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + vxc_short8 src0, src1; + vxc_float sum = 0, sqr = 0; + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + for(coord.x = 8; coord.x < (width+8); coord.x += 8) + { + vxc_half8 val0_h; + _viv_asm(COPY, val0_h, src0, 16); + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + vxc_float4 sumsqr; + VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniFp16SumSqr_dp8x2); + sum += sumsqr.x; + sqr += sumsqr.y; + } + vxc_float mean; + mean = sum * dimRatio; + vxc_float vari; + vari = sqr*dimRatio - mean*mean; + vari += eps; + vari = rsqrt(vari); + vxc_float4 bias_f; + for(coord.x = 0; coord.x < width; coord.x += 4) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord.xw); + vxc_half8 in_h, scale_h; + _viv_asm(COPY, in_h, src0, 16); + _viv_asm(COPY, scale_h, src1, 16); + vxc_float4 in_f, scale_f; + VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + vxc_float4 sub, norm; + sub = in_f - mean; + norm = scale_f * vari * sub + bias_f; + norm = norm * outputScale + output_zp; + int4 output_int4; + output_int4 = convert_int4_rte(norm); + vxc_uchar8 dst; + VXC_DP2x8(dst, output_int4, output_int4, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), + uniConvertInt32toUint8_2x8); + coord_out.x = coord.x; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_layernormalize.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_2d.vx similarity index 87% rename from src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_layernormalize.vx rename to src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_2d.vx index db424ad..d517d7d 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_layernormalize.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_2d.vx @@ -7,12 +7,9 @@ _viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; _viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; _viv_uniform VXC_512Bits uniExtractHalf4_dp4x4; -__kernel void vxcLayerNorm( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_array_t output, - float eps) +__kernel void layer_norm_F16toF16_2D( + image2d_t input, image2d_t bias, image2d_t scale, + image2d_t output, float eps) { int4 coord = (int4)(0, get_global_id(1), 0, 0); vxc_short8 src0, src1; @@ -44,7 +41,7 @@ __kernel void vxcLayerNorm( VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord.xwww); + bias_f = read_imagef(bias, coord.xw); vxc_half8 in_h, scale_h; _viv_asm(COPY, in_h, src0, 16); _viv_asm(COPY, scale_h, src1, 16); @@ -76,7 +73,7 @@ _viv_uniform VXC_512Bits uniSqrSum_16x1; _viv_uniform float input_scale; _viv_uniform int inputZP; _viv_uniform float outputScale; -_viv_uniform int output_ZP; +_viv_uniform float output_zp; _viv_uniform int sumInZp; _viv_uniform int tmpZp1; _viv_uniform int tmpZp2; @@ -84,12 +81,9 @@ _viv_uniform float e2InScale; _viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; _viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; -__kernel void vxcLayerNorm_u8( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_array_t output, - float eps) +__kernel void layer_norm_U8toU8_2D( + image2d_t input, image2d_t bias, image2d_t scale, + image2d_t output, float eps) { int4 coord = (int4)(0, get_global_id(1), 0, 0); vxc_uchar16 src0, src2; @@ -121,15 +115,15 @@ __kernel void vxcLayerNorm_u8( vari = rsqrt(vari); vxc_int4 tmpVal0, tmpVal1; vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; - int4 coord_bias = (int4)(0, 0, 0, 0); + int2 coord_bias = (int2)(0, 0); for(coord.x = 0; coord.x < width; coord.x += 16) { - coord_bias.x = coord.x; VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ 
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_bias.x = coord.x; _viv_asm(COPY, scale_h, src1, 16); VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ UniFP16toFP32Lo4_dp4x4); @@ -151,49 +145,41 @@ __kernel void vxcLayerNorm_u8( uniConvert3rdUint8SubZpToFp32_4x4); VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ uniConvert4thUint8SubZpToFp32_4x4); - tmpData0 *= input_scale; - tmpData1 *= input_scale; - tmpData2 *= input_scale; - tmpData3 *= input_scale; + tmpData0 = tmpData0 * input_scale - mean; + tmpData1 = tmpData1 * input_scale - mean; + tmpData2 = tmpData2 * input_scale - mean; + tmpData3 = tmpData3 * input_scale - mean; vxc_float4 norm; - tmpData0 -= mean; norm = scale_f0 * vari * tmpData0 + bias_f0; bias_f0 = read_imagef(bias, coord_bias); VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ UniFP16toFP32Lo4_dp4x4); coord_bias.x += 4; - tmpVal0 = convert_int4_rte(norm * outputScale + output_ZP); + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - tmpData1 -= mean; norm = scale_f1 * vari * tmpData1 + bias_f1; bias_f1 = read_imagef(bias, coord_bias); VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ uniConvertSecFp16Fp32_4x4); - tmpVal1 = convert_int4_rte(norm * outputScale + output_ZP); + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ uniConvertInt32toUint8_2x8); - tmpData2 -= mean; norm = scale_f0 * vari * tmpData2 + bias_f0; - tmpVal0 = convert_int4_rte(norm * outputScale + output_ZP); + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - tmpData3 -= mean; norm = scale_f1 * vari * tmpData3 + bias_f1; - tmpVal1 = convert_int4_rte(norm * outputScale + output_ZP); + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\ uniConvertInt32toUint8_2x8); VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); } } /***************************layernorm float16 to uint8**************************/ -_viv_uniform float outputZP; -__kernel void vxcLayerNormFP16toU8( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_array_t output, - float eps) +__kernel void layer_norm_F16toU8_2D( + image2d_t input, image2d_t bias, image2d_t scale, + image2d_t output, float eps) { int4 coord = (int4)(0, get_global_id(1), 0, 0); vxc_short8 src0, src1; @@ -225,7 +211,7 @@ __kernel void vxcLayerNormFP16toU8( VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord.xwww); + bias_f = read_imagef(bias, coord.xw); vxc_half8 in_h, scale_h; _viv_asm(COPY, in_h, src0, 16); _viv_asm(COPY, scale_h, src1, 16); @@ -237,7 +223,7 @@ __kernel void vxcLayerNormFP16toU8( vxc_float4 sub, norm; sub = in_f - mean; norm = scale_f * vari * sub + bias_f; - norm = norm * outputScale + outputZP; + norm = norm * outputScale + output_zp; int4 output_int4; output_int4 = convert_int4_rte(norm); vxc_uchar8 dst; @@ -245,4 +231,4 @@ __kernel void vxcLayerNormFP16toU8( uniConvertInt32toUint8_2x8); VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); } -} \ No newline at end of file +} diff 
--git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_i16.vx new file mode 100644 index 0000000..bedc979 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_i16.vx @@ -0,0 +1,167 @@ +#include "cl_viv_vx_ext.h" + +/**************************layernorm float16***********************************/ +_viv_uniform int width; +_viv_uniform float dimRatio; +_viv_uniform float dimRatio_scale; +_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform float e2InScale; +_viv_uniform float outputScale; +_viv_uniform float output_zp; +_viv_uniform float input_scale; +_viv_uniform int inputZP; + +__kernel void layer_norm_I16toI16( + image2d_array_t input, image2d_t bias, image2d_t scale, + image2d_array_t output, float eps) +{ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.w, baseAddr); + + vxc_short8 src0, src1, dst; + vxc_float sum = 0, sqr = 0; + for(; coord.x < width;) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.x += 8; + vxc_float4 sumsqr; + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniInt16SumSqr_dp8x2); + sum += sumsqr.x; + sqr = sqr + sumsqr.y * e2InScale; + } + vxc_float mean; + mean = sum * dimRatio_scale; + vxc_float vari; + vari = sqr*dimRatio - mean*mean; + vari += eps; + vari = rsqrt(vari); + + short zp = inputZP; + vxc_float4 tmpData0, tmpData1; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_half8 scale_h; + vxc_int4 tmpVal0, tmpVal1; + + int2 coord_bias = (int2)(0, 0); + + for(coord.x = 0; coord.x < width; coord.x += 8) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_bias.x = coord.x; + VXC_ReadImage(src1, scale, coord_bias, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + + vxc_float4 sub, norm; + sub = tmpData0 * input_scale - mean; + norm = scale_f0 * vari * sub + bias_f0; + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); + sub = tmpData1 * input_scale - mean; + norm = scale_f1 * vari * sub + bias_f1; + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); + + 
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel void layer_norm_I16toI16_2D( + image2d_t input, image2d_t bias, image2d_t scale, + image2d_t output, float eps) +{ + int2 coord = (int2)(0, get_global_id(1)); + + vxc_short8 src0, src1, dst; + vxc_float sum = 0, sqr = 0; + for(; coord.x < width;) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.x += 8; + vxc_float4 sumsqr; + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniInt16SumSqr_dp8x2); + sum += sumsqr.x; + sqr = sqr + sumsqr.y * e2InScale; + } + vxc_float mean, vari; + mean = sum * dimRatio_scale; + vari = sqr * dimRatio - mean * mean; + vari += eps; + vari = rsqrt(vari); + + short zp = inputZP; + vxc_float4 tmpData0, tmpData1; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_half8 scale_h; + vxc_int4 tmpVal0, tmpVal1; + + int2 coord_bias = (int2)(0, 0); + + for(coord.x = 0; coord.x < width; coord.x += 8) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_bias.x = coord.x; + VXC_ReadImage(src1, scale, coord_bias, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + + vxc_float4 sub, norm; + sub = tmpData0 * input_scale - mean; + norm = scale_f0 * vari * sub + bias_f0; + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); + sub = tmpData1 * input_scale - mean; + norm = scale_f1 * vari * sub + bias_f1; + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); + + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_u8_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_u8_f16.vx new file mode 100644 index 0000000..d7d7066 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_u8_f16.vx @@ -0,0 +1,252 @@ +#include "cl_viv_vx_ext.h" + +/*****************************layernorm uint8 to fp16****************************/ +_viv_uniform int width; +_viv_uniform float dimRatio; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniSumU8_16x1; +_viv_uniform VXC_512Bits uniSqrSum_16x1; +_viv_uniform float input_scale; +_viv_uniform int inputZP; +_viv_uniform int sumInZp; +_viv_uniform int tmpZp1; +_viv_uniform int tmpZp2; 
+_viv_uniform float e2InScale; +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; +_viv_uniform VXC_512Bits UniPackFP16even_2x8; + +__kernel void layer_norm_U8toF16( + image2d_array_t input, + image2d_t bias, + image2d_t scale, + image2d_array_t output, + float eps) +{ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + vxc_uchar16 src0; + float sum = 0, sqr = 0; + int tmpSum = 0, tmpSqr = 0; + vxc_int4 tmpSum1; + vxc_int4 tmpSqr1; + + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + for(coord.x = 0; coord.x < width; coord.x += 16) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); + tmpSum += (tmpSum1.x); + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); + tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x); + } + sum = (tmpSum + sumInZp) * input_scale; + sqr = (tmpSqr + tmpZp2) * e2InScale; + + float mean, vari; + mean = sum * dimRatio; + vari = sqr*dimRatio - mean*mean; + vari += eps; + vari = rsqrt(vari); + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; + int2 coord_bias = (int2)(0, 0); + vxc_half8 scale_h; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_short8 src1, outval; + short zp = inputZP; + half4 tmpVal0, tmpVal1; + vxc_half8 dst; + + for(coord.x = 0; coord.x < width; coord.x += 16) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_bias.x = coord.x; + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert4thUint8SubZpToFp32_4x4); + tmpData0 *= input_scale; + tmpData1 *= input_scale; + tmpData2 *= input_scale; + tmpData3 *= input_scale; + + vxc_float4 norm; + tmpData0 -= mean; + norm = scale_f0 * vari * tmpData0 + bias_f0; + bias_f0 = read_imagef(bias, coord_bias); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + coord_bias.x += 4; + _viv_asm(CONV, tmpVal0, norm); + + tmpData1 -= mean; + norm = scale_f1 * vari * tmpData1 + 
bias_f1; + bias_f1 = read_imagef(bias, coord_bias); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + UniPackFP16even_2x8); + _viv_asm(COPY, outval, dst, 16); + coord_out.x = coord.x; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + + tmpData2 -= mean; + norm = scale_f0 * vari * tmpData2 + bias_f0; + _viv_asm(CONV, tmpVal0, norm); + + tmpData3 -= mean; + norm = scale_f1 * vari * tmpData3 + bias_f1; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + UniPackFP16even_2x8); + _viv_asm(COPY, outval, dst, 16); + coord_out.x += 8; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel void layer_norm_U8toF16_2D( + image2d_t input, + image2d_t bias, + image2d_t scale, + image2d_t output, + float eps) +{ + int4 coord = (int4)(0, get_global_id(1), 0, 0); + vxc_uchar16 src0; + float sum = 0, sqr = 0; + int tmpSum = 0, tmpSqr = 0; + vxc_int4 tmpSum1; + vxc_int4 tmpSqr1; + + for(coord.x = 0; coord.x < width; coord.x += 16) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); + tmpSum += (tmpSum1.x); + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); + tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x); + } + sum = (tmpSum + sumInZp) * input_scale; + sqr = (tmpSqr + tmpZp2) * e2InScale; + + float mean, vari; + mean = sum * dimRatio; + vari = sqr*dimRatio - mean*mean; + vari += eps; + vari = rsqrt(vari); + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; + int2 coord_bias = (int2)(0, 0); + vxc_half8 scale_h; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_short8 src1, outval; + short zp = inputZP; + half4 tmpVal0, tmpVal1; + vxc_half8 dst; + + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); + + for(coord.x = 0; coord.x < width; coord.x += 16) + { + coord_bias.x = coord.x; + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert4thUint8SubZpToFp32_4x4); + tmpData0 *= 
input_scale; + tmpData1 *= input_scale; + tmpData2 *= input_scale; + tmpData3 *= input_scale; + + vxc_float4 norm; + tmpData0 -= mean; + norm = scale_f0 * vari * tmpData0 + bias_f0; + bias_f0 = read_imagef(bias, coord_bias); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + coord_bias.x += 4; + _viv_asm(CONV, tmpVal0, norm); + + tmpData1 -= mean; + norm = scale_f1 * vari * tmpData1 + bias_f1; + bias_f1 = read_imagef(bias, coord_bias); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + UniPackFP16even_2x8); + _viv_asm(COPY, outval, dst, 16); + coord_out.x = coord.x; + VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + tmpData2 -= mean; + norm = scale_f0 * vari * tmpData2 + bias_f0; + _viv_asm(CONV, tmpVal0, norm); + + tmpData3 -= mean; + norm = scale_f1 * vari * tmpData3 + bias_f1; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + UniPackFP16even_2x8); + _viv_asm(COPY, outval, dst, 16); + coord_out.x += 8; + VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_f16.vx new file mode 100644 index 0000000..03802e8 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_f16.vx @@ -0,0 +1,426 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; +_viv_uniform int width; + +_viv_uniform int height; + +_viv_uniform int height_depth; +_viv_uniform float dimRatio; +_viv_uniform int group_num; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform float outputScale; +_viv_uniform float output_zp; + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_F16toF32( + image2d_array_t input, image2d_t output) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + vxc_short8 src0; + vxc_half8 in_h; + vxc_float4 sumsqr; + vxc_float4 tmpSumSqr = (vxc_float4)(0); + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.w, baseAddr_a); + + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + _viv_asm(COPY, in_h, src0, 16); + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniFp16SumSqr_dp8x2); + tmpSumSqr += sumsqr; + } + } + + lcl_sum[lidx] = tmpSumSqr.x; + lcl_sqr[lidx] = tmpSumSqr.y; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(get_group_id(0) << 2, gidz); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + float sum = 0; + float sqr = 0; + for(int i = 0; i < 4; i++) + { + sum 
+= dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_F16toF32_2D( + image2d_array_t input, image2d_t output) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int gidy = gidz * height; + + int2 coord = (int2)(gidx, gidy); + vxc_short8 src0; + vxc_half8 in_h; + vxc_float4 sumsqr; + vxc_float4 tmpSumSqr = (vxc_float4)(0); + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int endH = gidy + height; + if(gidx < width) + { + for(; coord.y < endH;) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + _viv_asm(COPY, in_h, src0, 16); + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniFp16SumSqr_dp8x2); + tmpSumSqr += sumsqr; + } + } + + lcl_sum[lidx] = tmpSumSqr.x; + lcl_sqr[lidx] = tmpSumSqr.y; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(get_group_id(0) << 2, gidz); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + float sum = 0; + float sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toF16( + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int2 coord_sum = (int2)(0, gidz); + int4 coord_para = coord; + coord_para.z = (ushort)gidz / (ushort)(height_depth); + vxc_short8 src0; + vxc_short8 src1; + vxc_half8 scale_h, in_h; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_sum); + coord_sum.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + int4 coord_bias = coord_para; + + int8 input_desc, scale_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.w, baseAddr_a); + + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; + _viv_asm(MOV, coord_para.w, baseAddr_c); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.z, baseAddr); + + vxc_float4 tmpData0, tmpData1; + vxc_short8 outval; + half4 tmpVal0, tmpVal1; + vxc_half8 dst; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_para.y = coord.y; + coord_bias.y = coord.y; + VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x = coord.x; + + _viv_asm(COPY, in_h, src0, 16); 
+ VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + + vxc_float4 sub, norm; + sub = tmpData0 - mean_vari.s0; + norm = scale_f0 * mean_vari.s1 * sub + bias_f0; + _viv_asm(CONV, tmpVal0, norm); + sub = tmpData1 - mean_vari.s0; + norm = scale_f1 * mean_vari.s1 * sub + bias_f1; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toF16_2D( + image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_t output, float eps) +{ + int2 coord = (int2)(get_global_id(0), 0); + int2 coord_bias = (int2)(0, 0); + vxc_short8 src0; + vxc_short8 src1; + vxc_half8 scale_h, in_h; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_bias); + coord_bias.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + coord_bias = coord; + + vxc_float4 tmpData0, tmpData1; + vxc_short8 outval; + half4 tmpVal0, tmpVal1; + vxc_half8 dst; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_bias.y = coord.y; + VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x = coord.x; + + _viv_asm(COPY, in_h, src0, 16); + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + + vxc_float4 sub, norm; + sub = tmpData0 - mean_vari.s0; + norm = scale_f0 * mean_vari.s1 * sub + bias_f0; + _viv_asm(CONV, tmpVal0, norm); + sub = tmpData1 - mean_vari.s0; + norm = scale_f1 * mean_vari.s1 * sub + bias_f1; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toU8( + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, 
0); + int2 coord_sum = (int2)(0, gidz); + int4 coord_para = coord; + coord_para.z = (ushort)gidz / (ushort)(height_depth); + vxc_short8 src0; + vxc_short8 src1; + vxc_half8 scale_h, in_h; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_sum); + coord_sum.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + int4 coord_bias = coord_para; + + int8 input_desc, scale_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.w, baseAddr_a); + + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; + _viv_asm(MOV, coord_para.w, baseAddr_c); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.z, baseAddr); + + vxc_float4 tmpData0, tmpData1; + vxc_uchar16 outval; + vxc_int4 tmpVal0, tmpVal1; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_para.y = coord.y; + coord_bias.y = coord.y; + VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x = coord.x; + + _viv_asm(COPY, in_h, src0, 16); + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + + vxc_float4 sub, norm; + sub = tmpData0 - mean_vari.s0; + norm = scale_f0 * mean_vari.s1 * sub + bias_f0; + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); + sub = tmpData1 - mean_vari.s0; + norm = scale_f1 * mean_vari.s1 * sub + bias_f1; + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toU8_2D( + image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_t output, float eps) +{ + int2 coord = (int2)(get_global_id(0), 0); + int2 coord_bias = (int2)(0, 0); + vxc_short8 src0; + vxc_short8 src1; + vxc_half8 scale_h, in_h; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_bias); + coord_bias.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + coord_bias = coord; + + vxc_float4 tmpData0, tmpData1; + vxc_uchar16 outval; + vxc_int4 tmpVal0, tmpVal1; + + for(coord.y = 0; 
coord.y < height; coord.y++) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_bias.y = coord.y; + VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x = coord.x; + + _viv_asm(COPY, in_h, src0, 16); + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + + vxc_float4 sub, norm; + sub = tmpData0 - mean_vari.s0; + norm = scale_f0 * mean_vari.s1 * sub + bias_f0; + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); + sub = tmpData1 - mean_vari.s0; + norm = scale_f1 * mean_vari.s1 * sub + bias_f1; + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_i16.vx new file mode 100644 index 0000000..61e4e29 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_i16.vx @@ -0,0 +1,266 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2; +_viv_uniform float e2InScale; +_viv_uniform int width; + +_viv_uniform float input_scale; +_viv_uniform int height; + +_viv_uniform int height_depth; +_viv_uniform float dimRatio; +_viv_uniform int group_num; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform float outputScale; +_viv_uniform float output_zp; + +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; +_viv_uniform int inputZP; + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_I16toF32( + image2d_array_t input, image2d_t output) +{ + int gidx = get_global_id(0) << 4; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + vxc_short8 src0; + float4 tmpSumSqr = (float4)(0); + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.w, baseAddr_a); + + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + vxc_float4 sumsqr; + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniInt16SumSqr_dp8x2); + tmpSumSqr += sumsqr; + } + tmpSumSqr.x *= input_scale; + tmpSumSqr.y *= e2InScale; + } + lcl_sum[lidx] = tmpSumSqr.x; + lcl_sqr[lidx] = tmpSumSqr.y; + 
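/* Editor's note, not part of the patch: the sumSqr kernels are the first pass of a
 * two-pass layer norm over the W*H plane. Each of the 16 work-items accumulates a
 * partial sum and sum-of-squares for its strip of columns, parks the pair in local
 * memory, and after the barrier below work-item 0 folds the 16 partials into one
 * (sum, sumSqr) value per work-group. A plain-C sketch of the same accumulation,
 * with hypothetical names (e2InScale is presumably the squared input scale):
 *
 *     float sum = 0.0f, sqr = 0.0f;            // this work-item's strip
 *     for (int y = 0; y < height; ++y)
 *         for (int k = 0; k < 8; ++k) {        // 8 int16 lanes per row load
 *             float v = (float)strip[y][k];
 *             sum += v;
 *             sqr += v * v;
 *         }
 *     lcl_sum[lidx] = sum * input_scale;
 *     lcl_sqr[lidx] = sqr * e2InScale;
 *
 * The apply kernels later turn the group totals into mean = sum * dimRatio and
 * var = sumSqr * dimRatio - mean * mean. */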
barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(get_group_id(0) << 2, gidz); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + float4 data = (float4)(0); + for(int i = 0; i < 4; i++) + { + data.x += dot(tmp_sum[i], one); + data.y += dot(tmp_sqr[i], one); + } + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_I16toF32_2D( + image2d_t input, image2d_t output) +{ + int gidx = get_global_id(0) << 4; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int gidy = gidz * height; + + int2 coord = (int2)(gidx, gidy); + vxc_short8 src0; + float4 tmpSumSqr = (float4)(0); + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int endH = gidy + height; + if(gidx < width) + { + for(; coord.y < endH;) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + vxc_float4 sumsqr; + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniInt16SumSqr_dp8x2); + tmpSumSqr += sumsqr; + } + tmpSumSqr.x *= input_scale; + tmpSumSqr.y *= e2InScale; + } + lcl_sum[lidx] = tmpSumSqr.x; + lcl_sqr[lidx] = tmpSumSqr.y; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(get_group_id(0) << 2, gidz); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + float4 data = (float4)(0); + for(int i = 0; i < 4; i++) + { + data.x += dot(tmp_sum[i], one); + data.y += dot(tmp_sqr[i], one); + } + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16toI16( + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int2 coord_sum = (int2)(0, gidz); + int4 coord_para = coord; + coord_para.z = (ushort)gidz / (ushort)(height_depth); + vxc_short8 src0, src1, outval; + vxc_half8 scale_h; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_sum); + coord_sum.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + int4 coord_bias = coord_para; + + int8 input_desc, scale_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.w, baseAddr_a); + + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; + _viv_asm(MOV, coord_para.w, baseAddr_c); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.z, baseAddr); + + short zp = inputZP; + vxc_float4 tmpData0, tmpData1, norm; + vxc_int4 tmpVal0, tmpVal1; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_para.y = coord.y; + coord_bias.y = coord.y; + VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0), + 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x = coord.x; + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + tmpData0 = tmpData0 * input_scale - mean_vari.s0; + tmpData1 = tmpData1 * input_scale - mean_vari.s0; + + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); + + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16toI16_2D( + image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_t output, float eps) +{ + int2 coord = (int2)(get_global_id(0), 0); + int2 coord_bias = (int2)(0, 0); + vxc_short8 src0, src1, outval; + vxc_half8 scale_h; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_bias); + coord_bias.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + coord_bias = coord; + + short zp = inputZP; + vxc_float4 tmpData0, tmpData1, norm; + vxc_int4 tmpVal0, tmpVal1; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_bias.y = coord.y; + VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x = coord.x; + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + tmpData0 = tmpData0 * input_scale - mean_vari.s0; + tmpData1 = tmpData1 * input_scale - mean_vari.s0; + + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); + + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff 
--git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_u8.vx new file mode 100644 index 0000000..521a8cf --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_u8.vx @@ -0,0 +1,419 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniSumU8_16x1; +_viv_uniform VXC_512Bits uniSqrSum_16x1; +_viv_uniform int sumInZp; +_viv_uniform int tmpZp1; +_viv_uniform float e2InScale; +_viv_uniform float rowSumScale; +_viv_uniform int width; + +_viv_uniform float input_scale; +_viv_uniform int height; + +_viv_uniform int height_depth; +_viv_uniform float dimRatio; +_viv_uniform int group_num; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform float outputScale; +_viv_uniform float output_zp; + +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; +_viv_uniform int inputZP; + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_U8toF32( + image2d_array_t input, image2d_t output) +{ + int gidx = get_global_id(0) << 4; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + vxc_uchar16 src0; + float sum = 0, sqr = 0; + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.w, baseAddr_a); + + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord.y++; + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); + tmpSum += (tmpSum1); + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); + tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1); + } + sqr += (tmpSqr * e2InScale + rowSumScale); + sum = (tmpSum + sumInZp) * input_scale; + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(get_group_id(0) << 2, gidz); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_U8toF32_2D( + image2d_t input, image2d_t output) +{ + int gidx = get_global_id(0) << 4; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int gidy = gidz * height; + + int2 coord = (int2)(gidx, gidy); + vxc_uchar16 src0; + float sum = 0, sqr = 0; + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int endH = gidy + height; + if(gidx < width) + { + for(; coord.y < endH;) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord.y++; + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), 
uniSumU8_16x1); + tmpSum += (tmpSum1); + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); + tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1); + } + sqr += (tmpSqr * e2InScale + rowSumScale); + sum = (tmpSum + sumInZp) * input_scale; + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(get_group_id(0) << 2, gidz); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF16( + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int2 coord_sum = (int2)(0, gidz); + int4 coord_para = coord; + coord_para.z = (ushort)gidz / (ushort)(height_depth); + vxc_uchar16 src0; + vxc_short8 src1, outval; + vxc_half8 scale_h, dst; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_sum); + coord_sum.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + int4 coord_bias = coord_para; + + int8 input_desc, scale_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.w, baseAddr_a); + + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; + _viv_asm(MOV, coord_para.w, baseAddr_c); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.z, baseAddr); + + short zp = inputZP; + vxc_float4 tmpData0, tmpData1, norm; + half4 tmpVal0, tmpVal1; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_para.y = coord.y; coord_bias.y = coord.y; + VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x = coord.x; + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + tmpData0 = tmpData0 * input_scale - mean_vari.s0; + tmpData1 = tmpData1 * input_scale - mean_vari.s0; + + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; + _viv_asm(CONV, tmpVal0, norm); + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; + _viv_asm(CONV, tmpVal1, norm); + + 
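/* Editor's note, not part of the patch: in scalar C terms the loop body above
 * computes, for each quantized input x_q and channel c,
 *
 *     x = (x_q - inputZP) * input_scale;                  // dequantize
 *     y = scale[c] * (x - mean) * inv_std + bias[c];      // affine layer norm
 *
 * where mean = mean_vari.s0 and inv_std = mean_vari.s1 = rsqrt(var + eps).
 * The *toF16 variants convert y straight to half below; the *toU8 / *toI16
 * variants requantize with convert_int4_rte(y * outputScale + output_zp). */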
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF16_2D( + image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_t output, float eps) +{ + int2 coord = (int2)(get_global_id(0), 0); + int2 coord_bias = (int2)(0, 0); + vxc_uchar16 src0; + vxc_short8 src1, outval; + vxc_half8 scale_h, dst; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_bias); + coord_bias.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + coord_bias = coord; + + short zp = inputZP; + vxc_float4 tmpData0, tmpData1, norm; + half4 tmpVal0, tmpVal1; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_bias.y = coord.y; + VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x = coord.x; + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + tmpData0 = tmpData0 * input_scale - mean_vari.s0; + tmpData1 = tmpData1 * input_scale - mean_vari.s0; + + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; + _viv_asm(CONV, tmpVal0, norm); + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; + _viv_asm(CONV, tmpVal1, norm); + + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU8( + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int2 coord_sum = (int2)(0, gidz); + int4 coord_para = coord; + coord_para.z = (ushort)gidz / (ushort)(height_depth); + vxc_uchar16 src0 , outval; + vxc_short8 src1; + vxc_half8 scale_h; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_sum); + coord_sum.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + int4 coord_bias = coord_para; + + int8 input_desc, scale_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; + 
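/* Editor's note, not part of the patch: this descriptor idiom, repeated for the
 * scale and output images, copies the image handle into an int8 register and
 * forms base + z * stride from fields .s0 and .s4 (which read as the slice base
 * address and the per-slice pitch). The result is parked in coord.w, coord_para.w
 * and coord.z so the img_load_3d and img_store_3d ops in the loop can address one
 * W*H slice with plain 2D coordinates. */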
_viv_asm(MOV, coord.w, baseAddr_a); + + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; + _viv_asm(MOV, coord_para.w, baseAddr_c); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.z, baseAddr); + + short zp = inputZP; + vxc_float4 tmpData0, tmpData1, norm; + vxc_int4 tmpVal0, tmpVal1; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_para.y = coord.y; + coord_bias.y = coord.y; + VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x = coord.x; + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + tmpData0 = tmpData0 * input_scale - mean_vari.s0; + tmpData1 = tmpData1 * input_scale - mean_vari.s0; + + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); + + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU8_2D( + image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_t output, float eps) +{ + int2 coord = (int2)(get_global_id(0), 0); + int2 coord_bias = (int2)(0, 0); + vxc_uchar16 src0, outval; + vxc_short8 src1; + vxc_half8 scale_h; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_bias); + coord_bias.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + coord_bias = coord; + + short zp = inputZP; + vxc_float4 tmpData0, tmpData1, norm; + vxc_int4 tmpVal0, tmpVal1; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_bias.y = coord.y; + VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x = coord.x; + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + VXC_DP4x4(tmpData0, src0, zp, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + tmpData0 = tmpData0 * input_scale - mean_vari.s0; + tmpData1 = tmpData1 * input_scale - mean_vari.s0; + + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); + + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_bgra_trans.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_bgra_trans.vx deleted file mode 100644 index 0ce3d53..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_bgra_trans.vx +++ /dev/null @@ -1,136 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniDescaleU8_4x4; -_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4; - -_viv_uniform VXC_512Bits uniBilinearTmp1BgraShort_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp2BgraShort_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp3BgraShort_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp4BgraShort_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp5BgraShort_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp6BgraShort_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp7BgraShort_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp8BgraShort_4x4; - -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; -_viv_uniform VXC_512Bits uniExtractInt32BgraToU8Bgr_2x8; - -_viv_uniform int zp; -_viv_uniform float outputScale; - -__kernel void pre_process_bgra_scale_nhwc_U8toU8( - __read_only image2d_array_t input, __write_only image2d_array_t output, - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) -{ - int4 gidx = get_global_id(0); - int gidy = get_global_id(1); - gidx += (int4)(0, 1, 2, 3); - - int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); - int4 sx = fx & 0xffff8000; // Floor - int fy, sy; - fx -= sx; - sx = sx >> 15; - fx = (fx +(1 << 4)) >> 5; - - // for y - fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); - sy = fy & 0xffff8000; // Floor - fy -= sy; - sy = sy >> 15; - - sy = sy < 0 ? 0 : sy; - fy = fy < 0 ? 
0 : fy; - - fy = (fy + (1<< 4)) >> 5; - sx = (sx + (*xOffset)) * 4 ; - sy += (*yOffset); - int4 srcPos = (int4)(sx.x, sy, sy + 1, sx.y); - vxc_uchar16 lineBGRA0, lineBGRA1, lineBGRA2, lineBGRA3; - vxc_uchar16 dataB, dataG, dataR; - - VXC_ReadImage(lineBGRA0, input, srcPos.xy, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(lineBGRA0, input, srcPos.xz, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(lineBGRA1, input, srcPos.wy, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(lineBGRA1, input, srcPos.wz, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0)); - - srcPos.x = sx.z; - srcPos.w = sx.w; - - VXC_ReadImage(lineBGRA2, input, srcPos.xy, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(lineBGRA2, input, srcPos.xz, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(lineBGRA3, input, srcPos.wy, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(lineBGRA3, input, srcPos.wz, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0)); - - vxc_uchar4 val_u8; - int4 tmp1, tmp2, result1, result2; - float4 tmpDst, tmp0; - float4 mean = (float4)(bMean, gMean, rMean, 0); - //tmpFx = (int4)(fx.x, fx.x, fx.x, fx.x); - int tmpV = 1 << 19; - vxc_short8 tmpFx; - VXC_DP2x8(tmpFx, fx, fx, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), - uniConvertInt32toUint8_2x8); - //tmpFx = fx.xxxx; - VXC_DP4x4(tmp1, lineBGRA0, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), - uniBilinearTmp1BgraShort_4x4); - VXC_DP4x4(tmp2, lineBGRA0, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), - uniBilinearTmp2BgraShort_4x4); - tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10); - VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ - uniConvertIntergetoF32_4x4); - tmpDst = (tmp0 - mean) * var; - result1 = convert_int4_rte(tmpDst * outputScale + zp); - - //tmpFx = fx.yyyy; - VXC_DP4x4(tmp1, lineBGRA1, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3BgraShort_4x4); - VXC_DP4x4(tmp2, lineBGRA1, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4BgraShort_4x4); - tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10); - VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ - uniConvertIntergetoF32_4x4); - tmpDst = (tmp0 - mean) * var; - result2 = convert_int4_rte(tmpDst * outputScale + zp); - - vxc_uchar16 dst; - VXC_DP2x8(dst, result1, result2, VXC_MODIFIER(0, 5, 0, VXC_RM_ToNearestEven, 1), - uniExtractInt32BgraToU8Bgr_2x8); - - //tmpFx = fx.zzzz; - VXC_DP4x4(tmp1, lineBGRA2, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp5BgraShort_4x4); - VXC_DP4x4(tmp2, lineBGRA2, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp6BgraShort_4x4); - tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10); - VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ - uniConvertIntergetoF32_4x4); - tmpDst = (tmp0 - mean) * var; - result1 = convert_int4_rte(tmpDst * outputScale + zp); - - //tmpFx = fx.wwww; - VXC_DP4x4(tmp1, lineBGRA3, tmpFx, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniBilinearTmp7BgraShort_4x4); - VXC_DP4x4(tmp2, lineBGRA3, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp8BgraShort_4x4); - tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10); - VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ - uniConvertIntergetoF32_4x4); - tmpDst = (tmp0 - mean) * var; - result2 = convert_int4_rte(tmpDst * outputScale + zp); - - VXC_DP2x8(dst, result1, result2, VXC_MODIFIER(6, 11, 0, VXC_RM_ToNearestEven, 1), - uniExtractInt32BgraToU8Bgr_2x8); - - int4 dstPos = (int4)(get_global_id(0) * 3, gidy, 0, 0); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_trans_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_trans_u8.vx deleted file mode 100644 index e235c7f..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_trans_u8.vx +++ /dev/null @@ -1,89 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform int bOrder; -_viv_uniform int rOrder; - -_viv_uniform float outputScaleVar; -_viv_uniform float bMeanScaleVarZp; -_viv_uniform float gMeanScaleVarZp; -_viv_uniform float rMeanScaleVarZp; - -_viv_uniform uint xrIntFloat_16; -_viv_uniform uint yrIntFloat_16; - -_viv_uniform VXC_512Bits uniConvertNV12toB_4x4; -_viv_uniform VXC_512Bits uniConvertNV12toG_4x4; -_viv_uniform VXC_512Bits uniConvertNV12toR_4x4; - -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; -_viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8; - -_viv_uniform VXC_512Bits uniTransPackBgr1st_2x8; -_viv_uniform VXC_512Bits uniTransPackBgr2nd_2x8; - -__kernel void pre_process_nv12_trans_U8toU8( - __read_only image2d_t y_img, __read_only image2d_t uv_img, - __write_only image2d_t output, - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) -{ - uint4 gidx = get_global_id(0); - uint gidy = get_global_id(1); - gidx += (uint4)(0, 1, 2, 3); - - uint dy = (gidy * yrIntFloat_16) >> 16; - uint4 dx = (gidx * xrIntFloat_16) >> 16; - int sy = convert_int(dy) + (*yOffset); - int4 sx = convert_int4(dx) + (*xOffset); - int4 uvX = sx & 0xfffffffe; - int uvY = sy >> 1; - - vxc_uchar16 Y, UV; - int2 coord = (int2)(sx.x, sy); - int2 coord_uv = (int2)(uvX.x, uvY); - - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord.x = sx.y; - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord.x = sx.z; - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord.x = sx.w; - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - coord_uv.x = uvX.y; - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - coord_uv.x = uvX.z; - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - coord_uv.x = uvX.w; - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - - vxc_char16 tmpUV; - short tmpVal = 128; - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); - - float4 tmpDstB, tmpDstG, tmpDstR; - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); - - int4 result, dstR, dstG, dstB; - vxc_uchar16 dst, tmpPack; - dstB = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp); - dstG = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp); - dstR = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp); - - if(bOrder == 2) - { - int4 exchangeData = dstB; - dstB = dstR; - dstR = exchangeData; - } - - VXC_DP2x8(tmpPack, dstB, dstG, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr1st_2x8); - VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr2nd_2x8); - - int2 dstPos = (int2)(get_global_id(0) * 3, gidy); - VXC_WriteImage(output, dstPos, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy_trans.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy_trans.vx deleted file mode 100644 index da337ab..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy_trans.vx +++ /dev/null @@ -1,94 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform float outputScale; -_viv_uniform float outputZP; -_viv_uniform VXC_512Bits uniNormilizationLo_2x8; -_viv_uniform VXC_512Bits uniNormilizationHi_2x8; -#define IMAGE_PRE_PROCESS_COPY_16BITS_NHWC(dst_name, dst_type, copy_type) \ -__kernel void pre_process_rgb_copy_nhwc_U8to##dst_name \ - ( \ - __read_only image2d_array_t input, \ - __write_only image2d_array_t output, \ - global int *xRatio, \ - global int *yRatio, \ - global int *xOffset, \ - global int *yOffset, \ - float rMean, \ - float gMean, \ - float bMean, \ - float f32Var, \ - int reverse_channel, \ - int trans \ - ) \ -{ \ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ - \ - coord.xy += (int2) (*xOffset, *yOffset); \ - vxc_uchar16 src0, src1; \ - dst_type dst0, dst1; \ - copy_type dst; \ - \ - VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ - \ - f32Var *= outputScale; \ - float4 paramData = (float4)(rMean * f32Var - outputZP, gMean * f32Var - outputZP, \ - bMean * f32Var - outputZP, f32Var); \ - half4 paramData_f16; \ - _viv_asm(CONV, paramData_f16, paramData); \ - \ - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(0)); \ - coord_out.z = coord_out.x + 8; \ - \ - VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ - uniNormilizationLo_2x8); \ - VXC_DP2x8(dst1, src0, paramData_f16, VXC_MODIFIER(0, 6, 0, VXC_RM_ToNearestEven, 1), \ - uniNormilizationHi_2x8); \ - _viv_asm(COPY, dst, dst0, 16); \ - VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ - _viv_asm(COPY, dst, dst1, 16); \ - VXC_WriteImage(output, coord_out.zy, dst, VXC_MODIFIER(0, 6, 0, VXC_RM_TowardZero, 0)); \ -} -IMAGE_PRE_PROCESS_COPY_16BITS_NHWC(I16, vxc_short8, vxc_short8) -IMAGE_PRE_PROCESS_COPY_16BITS_NHWC(F16, vxc_half8, vxc_short8) - -#define IMAGE_PRE_PROCESS_COPY_8BITS_NHWC(dst_name, dst_type) \ -__kernel void 
pre_process_rgb_copy_nhwc_U8to##dst_name \ - ( \ - __read_only image2d_array_t input, \ - __write_only image2d_array_t output, \ - global int *xRatio, \ - global int *yRatio, \ - global int *xOffset, \ - global int *yOffset, \ - float rMean, \ - float gMean, \ - float bMean, \ - float f32Var, \ - int reverse_channel, \ - int trans \ - ) \ -{ \ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ - coord.xy += (int2) (*xOffset, *yOffset); \ - vxc_uchar16 src0, src1; \ - dst_type dst; \ - \ - VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ - \ - f32Var *= outputScale; \ - float4 paramData = (float4)(rMean * f32Var - outputZP, gMean * f32Var - outputZP, \ - bMean * f32Var - outputZP, f32Var); \ - \ - half4 paramData_f16; \ - _viv_asm(CONV, paramData_f16, paramData); \ - \ - int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); \ - \ - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ - uniNormilizationLo_2x8); \ - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(8, 14, 0, VXC_RM_ToNearestEven, 1), \ - uniNormilizationHi_2x8); \ - VXC_WriteImage(output, coord_out, dst, VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0)); \ -} -IMAGE_PRE_PROCESS_COPY_8BITS_NHWC(U8, vxc_uchar16) -IMAGE_PRE_PROCESS_COPY_8BITS_NHWC(I8, vxc_char16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_trans.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_trans.vx deleted file mode 100644 index 0820a03..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_trans.vx +++ /dev/null @@ -1,172 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniVecShift10; -_viv_uniform VXC_512Bits uniAddRShift; -_viv_uniform VXC_512Bits uniGetTempVal; -_viv_uniform VXC_512Bits uniExtractBytes; -_viv_uniform VXC_512Bits uniUnpackToR; -_viv_uniform VXC_512Bits uniUnpackToG; -_viv_uniform VXC_512Bits uniUnpackToB; - -_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4; -_viv_uniform float outputScale; -_viv_uniform VXC_512Bits uniExtract8Data_2x8; -_viv_uniform float outputZP; - -_viv_uniform VXC_512Bits uniRePackRGBLo_2x8; -_viv_uniform VXC_512Bits uniRePackRGBHi_2x8; -#define IMAGE_PRE_PROCESS_NHWC(dst_name, conv_type, dst_type, copy_type) \ -__kernel void pre_process_rgb_scale_nhwc_U8to##dst_name \ - ( \ -__read_only image2d_array_t input, \ -__write_only image2d_array_t output, \ - global int *xRatio, \ - global int *yRatio, \ - global int *xOffset, \ - global int *yOffset, \ - float rMean, \ - float gMean, \ - float bMean, \ - float f32Var, \ - int reverse_channel, \ - int trans \ - ) \ -{ \ - int2 ratioXY = (int2)(*xRatio, *yRatio); \ - int4 xPos = get_global_id(0); \ - int yPos = get_global_id(1); \ - int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \ - xPos += (int4)(0, 1, 2, 3); \ - \ - /*x*/ \ - int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \ - int4 sx = fx0 & 0xffff8000; \ - fx0 -= sx; \ - sx = sx >> 15; \ - \ - vxc_short4 fx; \ - VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); \ - /*y*/ \ - int fy = yPos * ratioXY.y + ratioSufXY.y; \ - int sy = fy & 0xffff8000; \ - \ - fy -= sy; \ - sy = sy >> 15; \ - \ - fy = (fy + (1<< 4)) >> 5; \ - \ - vxc_uchar16 line0RGB1, line0RGB2; \ - vxc_uchar16 line1RGB3, line1RGB4; \ - int4 coord; \ - sx = sx * 3 + *xOffset; \ - coord.xyz = sx.xyz; \ - coord.w = sy + *yOffset; \ - int2 coord1 = (int2)(sx.w, coord.w); \ - VXC_ReadImage(line0RGB1, input, coord.xw, 0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ - 
VXC_ReadImage(line0RGB1, input, coord.yw, 0, VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line0RGB2, input, coord.zw, 0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line0RGB2, input, coord1, 0, VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \ - \ - VXC_ReadImage(line1RGB3, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ - VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1RGB3, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ - VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1RGB4, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ - VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1RGB4, input, coord1, VXC_5BITOFFSET_XY(0, 1), \ - VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \ - \ - float4 bgrMean = (float4)(bMean, gMean, rMean, 0); \ - \ - bgrMean *= f32Var; \ - \ - int4 test01, temp1; \ - int4 test02, temp2; \ - int4 tt; \ - vxc_uchar4 val; \ - int4 coord_out = (int4)(xPos.x * 3, yPos, xPos.x * 3 + 6, 0); \ - \ - vxc_uchar8 line1, line2; \ - \ - /*R*/ \ - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); \ - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); \ - \ - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \ - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ - temp1 = temp1 + test01; \ - \ - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \ - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ - temp2 = temp2 + test02; \ - temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ - \ - vxc_float4 tmp_dst; \ - vxc_uchar4 u8_dst; \ - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \ - VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ - uniConvertIntergetoF32_4x4); \ - \ - /*convert U8 to dst*/ \ - dst_type dstRG, dstB, dst; \ - tmp_dst = tmp_dst * f32Var - bgrMean.zzzz; \ - tmp_dst = tmp_dst * outputScale + outputZP; \ - conv_type dst0; \ - _viv_asm(CONV_RTE, dst0, tmp_dst); \ - VXC_DP2x8(dstRG, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ - \ - /*G*/ \ - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); \ - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); \ - \ - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \ - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ - temp1 = temp1 + test01; \ - \ - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \ - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ - temp2 = temp2 + test02; \ - temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ - \ - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \ - VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ - uniConvertIntergetoF32_4x4); \ - \ - tmp_dst = tmp_dst * f32Var - bgrMean.y; \ - tmp_dst = tmp_dst * outputScale + outputZP; \ - _viv_asm(CONV_RTE, dst0, tmp_dst); \ - VXC_DP2x8(dstRG, dst0, dst0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ - \ - /*B*/ \ - VXC_DP2x8(line1, 
line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); \ - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); \ - \ - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \ - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ - temp1 = temp1 + test01; \ - \ - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \ - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ - temp2 = temp2 + test02; \ - temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ - \ - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \ - VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ - uniConvertIntergetoF32_4x4); \ - \ - tmp_dst = tmp_dst * f32Var - bgrMean.x; \ - tmp_dst = tmp_dst * outputScale + outputZP; \ - _viv_asm(CONV_RTE, dst0, tmp_dst); \ - VXC_DP2x8(dstB, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ - VXC_DP2x8(dst, dstRG, dstB, VXC_MODIFIER(0, 5, 0, VXC_RM_ToNearestEven, 1), uniRePackRGBLo_2x8); \ - copy_type result; \ - _viv_asm(COPY, result, dst, 16); \ - VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ - VXC_DP2x8(dst, dstRG, dstB, VXC_MODIFIER(0, 5, 0, VXC_RM_ToNearestEven, 1), uniRePackRGBHi_2x8); \ - _viv_asm(COPY, result, dst, 16); \ - VXC_WriteImage(output, coord_out.zy, result, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ -} -IMAGE_PRE_PROCESS_NHWC(U8, uint4, vxc_uchar16, vxc_uchar16) -IMAGE_PRE_PROCESS_NHWC(I8, int4, vxc_char16, vxc_char16) -IMAGE_PRE_PROCESS_NHWC(I16, int4, vxc_short8, vxc_short8) -IMAGE_PRE_PROCESS_NHWC(F16, half4, vxc_half8, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy_u8.vx index 4600537..951ee96 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy_u8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy_u8.vx @@ -23,19 +23,6 @@ _viv_uniform VXC_512Bits uniCalculateTmpB3rd_4x4; _viv_uniform VXC_512Bits uniCalculateTmpB4th_4x4; _viv_uniform VXC_512Bits uniCalculateB1st_4x4; -_viv_uniform VXC_512Bits uniPackBG0_2x8; -_viv_uniform VXC_512Bits uniPackTmpAndR_2x8; -_viv_uniform VXC_512Bits uniPackRB0_2x8; -_viv_uniform VXC_512Bits uniPackTmp0AndG_2x8; -_viv_uniform VXC_512Bits uniPackGR1_2x8; -_viv_uniform VXC_512Bits uniPackTmp1AndB_2x8; -_viv_uniform VXC_512Bits uniPackBG1_2x8; -_viv_uniform VXC_512Bits uniPackTmp1AndR_2x8; -_viv_uniform VXC_512Bits uniPackRB2_2x8; -_viv_uniform VXC_512Bits uniPackTmp2AndG_2x8; -_viv_uniform VXC_512Bits uniPackGR2_2x8; -_viv_uniform VXC_512Bits uniPackTmp2AndB_2x8; - _viv_uniform VXC_512Bits uniQuantU8toU8LoB_2x8; _viv_uniform VXC_512Bits uniQuantU8toU8HiB_2x8; _viv_uniform VXC_512Bits uniQuantU8toU8LoG_2x8; @@ -145,137 +132,3 @@ __kernel void pre_process_yuv420_copy_U8toU8( VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); } -// store bgrbgrbgr -__kernel void pre_process_yuv420_copy_trans_U8( - __read_only image2d_t y_img, - __read_only image2d_t u_img, - __read_only image2d_t v_img, - __write_only image2d_array_t output, - global int * xRatio, - global int * yRatio, - global int * xOffset, - global int * yOffset, - float rMean, - float gMean, - float bMean, - 
float var, - int reverse_channel, - int trans - ) -{ - int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); - int4 pos1 = (int4)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1, 0, 0); - vxc_uchar16 Y; - vxc_uchar8 U, V; - vxc_int4 C0, C1, C2, C3; - vxc_uchar16 R, G, B; - vxc_uchar16 dst; - - VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - var *= outputScale; - float4 paramData = (float4)(bMean * var - zp, gMean * var - zp,\ - rMean * var - zp, var); - half4 paramData_f16; - _viv_asm(CONV, paramData_f16, paramData); - - //C = Y - 16; - //D = U - 128; - //E = V - 128; - // calculate R - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8] - int tmpV = -56992; - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4); - - VXC_DP4x4(dst, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(dst, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(dst, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(dst, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - - VXC_DP2x8(R, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); - VXC_DP2x8(R, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); - - // calculate G - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8] - // 298Y - 208V - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4); - // 34784 - 100U - ushort tmpG = 34784; - vxc_ushort8 tmpDstG; - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); - VXC_DP4x4(dst, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4); - VXC_DP4x4(dst, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4); - VXC_DP4x4(dst, C2, tmpDstG, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4); - VXC_DP4x4(dst, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4); - - VXC_DP2x8(G, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); - VXC_DP2x8(G, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); - - // calculate B - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8] - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4); - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniCalculateTmpB2nd_4x4); - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4); - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4); - tmpV = -70688; - VXC_DP4x4(dst, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(dst, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(dst, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(dst, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - - VXC_DP2x8(B, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); - VXC_DP2x8(B, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); - - // reorder to bgr - vxc_uchar8 tmpdst0, tmpdst1; - vxc_uchar16 dst0, dst1, dst2; - - if(bOrder == 2) - { - vxc_uchar16 exchangeData = B; - B = R; - R = exchangeData; - } - - // BGR BGR BG - VXC_DP2x8(tmpdst0, B, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackBG0_2x8); - VXC_DP2x8(dst0, tmpdst0, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmpAndR_2x8); - - // RBG RBG RB - VXC_DP2x8(tmpdst0, R, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackRB0_2x8); - VXC_DP2x8(dst0, tmpdst0, G, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp0AndG_2x8); - - pos = (int4)(get_global_id(0) * 3, get_global_id(1), 0, 0); - - VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - pos.x += 16; - - // GRB GRB GR - VXC_DP2x8(tmpdst0, G, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackGR1_2x8); - VXC_DP2x8(dst1, tmpdst0, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmp1AndB_2x8); - - // BGR BGR BG - VXC_DP2x8(tmpdst0, B, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackBG1_2x8); - VXC_DP2x8(dst1, tmpdst0, R, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp1AndR_2x8); - - VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - pos.x += 16; - - // RBG RBG RB - VXC_DP2x8(tmpdst0, R, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackRB2_2x8); - VXC_DP2x8(dst2, tmpdst0, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmp2AndG_2x8); - - // GRB GRB GR - VXC_DP2x8(tmpdst1, G, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackGR2_2x8); - VXC_DP2x8(dst2, tmpdst1, B, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp2AndB_2x8); - - VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_trans_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_trans_u8.vx deleted file mode 100644 index afb6bef..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_trans_u8.vx +++ /dev/null @@ -1,235 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniCalculateR1st_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8; -_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8; - -_viv_uniform VXC_512Bits uniCalculateB1st_4x4; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; -_viv_uniform VXC_512Bits uniDescaleU8_4x4; - -_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4; -_viv_uniform 
VXC_512Bits uniCalculateTmpGWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4; -_viv_uniform VXC_512Bits uniCalculateGWise_4x4; -_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4; - -_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4; - -_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4; - -_viv_uniform VXC_512Bits uniTransPackBgr1st_2x8; -_viv_uniform VXC_512Bits uniTransPackBgr2nd_2x8; - -_viv_uniform int bOrder; -_viv_uniform int rOrder; -_viv_uniform int zp; -_viv_uniform float outputScale; - -__kernel void pre_process_yuv420_trans_U8toU8( - __read_only image2d_array_t y_img, __read_only image2d_array_t u_img, - __read_only image2d_array_t v_img, __write_only image2d_array_t output, - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) -{ - int4 gidx = get_global_id(0); - int gidy = get_global_id(1); - gidx += (int4)(0, 1, 2, 3); - - int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); - int4 sx = fx & 0xffff8000; // Floor - int fy, sy; - fx -= sx; - sx = sx >> 15; - fx = (fx +(1 << 4)) >> 5; - - // for y - fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); - sy = fy & 0xffff8000; // Floor - fy -= sy; - sy = sy >> 15; - - sy = sy < 0 ? 0 : sy; - fy = fy < 0 ? 0 : fy; - - fy = (fy + (1<< 4)) >> 5; - sx += (*xOffset); - sy += (*yOffset); - int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0); - int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0); - int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0); - - vxc_uchar16 Y, U, V; - vxc_int4 C0, C1, C2, C3; - vxc_uchar16 R, G, B; - - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.x + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.x + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - srcPos.x = sx.y; - srcPos1.x = sx.y >> 1; - srcPos2.x = sx.y >> 1; - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); - 
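/* Editor's note, not part of the patch: the reads above and below gather the 2x2
 * bilinear neighbourhood for four output pixels from 4:2:0 planar data. Y is
 * sampled at full resolution, at (sx, sy) and the row below; the U and V planes
 * are half resolution in both axes, hence the chroma coordinates built from
 * sx >> 1, (sx + 1) >> 1, sy >> 1 and (sy + 1) >> 1. */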
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.y + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.y + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); - - srcPos.x = sx.z; - srcPos1.x = sx.z >> 1; - srcPos2.x = sx.z >> 1; - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.z + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.z + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); - - srcPos.x = sx.w; - srcPos1.x = sx.w >> 1; - srcPos2.x = sx.w >> 1; - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.w + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.w + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); - - //C = Y - 16; D = U - 128; E = V - 128; - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8] - int tmpV = -56992; - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); - 
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); - VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8] - // 298Y - 208V - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); - // 34784 - 100U - ushort tmpG = 34784; - vxc_ushort8 tmpDstG, tmpDstG1; - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); - VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); - VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); - VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); - VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); - VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); - - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8] - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); - tmpV = -70688; - VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - - int4 result, temp1, temp2, dstR, dstG, dstB; - int4 tmpData0, tmpData1; - - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); - temp1 = fx * tmpData0 + tmpData1; - // temp2 - temp1 - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); - temp2 = fx * tmpData0 + tmpData1; - result = fy * temp2 + (temp1 << 10); - - tmpV = 1 << 19; - vxc_uchar8 dst, tmpPack; - float4 tmpDst; - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = (tmpDst - bMean) * var; - dstB = convert_int4_rte(tmpDst * outputScale + zp); - - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniBilinearTmp1st_4x4); - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); - temp1 = fx * tmpData0 + tmpData1; - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); - temp2 = fx * tmpData0 + tmpData1; - result = fy * temp2 + (temp1 << 10); - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = (tmpDst - gMean) * var; - dstG = convert_int4_rte(tmpDst * outputScale + zp); - - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); - temp1 = fx * tmpData0 + tmpData1; - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); - temp2 = fx * tmpData0 + tmpData1; - result = fy * temp2 + (temp1 << 10); - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = (tmpDst - rMean) * var; - dstR = convert_int4_rte(tmpDst * outputScale + zp); - - if(bOrder == 2) - { - int4 exchangeData = dstB; - dstB = dstR; - dstR = exchangeData; - } - - VXC_DP2x8(tmpPack, dstB, dstG, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr1st_2x8); - VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr2nd_2x8); - - int2 dstPos = (int2)(get_global_id(0) * 3, gidy); - VXC_WriteImage(output, dstPos, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_copy_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_copy_u8.vx index ca99597..20803c9 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_copy_u8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_copy_u8.vx @@ -22,19 +22,6 @@ _viv_uniform VXC_512Bits uniCalculateTmpB3rd_4x4; _viv_uniform VXC_512Bits uniCalculateTmpB4th_4x4; _viv_uniform VXC_512Bits uniCalculateB1st_4x4; -_viv_uniform VXC_512Bits uniPackBG0_2x8; -_viv_uniform VXC_512Bits uniPackTmpAndR_2x8; -_viv_uniform VXC_512Bits uniPackRB0_2x8; -_viv_uniform VXC_512Bits uniPackTmp0AndG_2x8; -_viv_uniform VXC_512Bits uniPackGR1_2x8; -_viv_uniform VXC_512Bits uniPackTmp1AndB_2x8; -_viv_uniform VXC_512Bits uniPackBG1_2x8; -_viv_uniform VXC_512Bits uniPackTmp1AndR_2x8; -_viv_uniform VXC_512Bits uniPackRB2_2x8; -_viv_uniform VXC_512Bits uniPackTmp2AndG_2x8; -_viv_uniform VXC_512Bits uniPackGR2_2x8; -_viv_uniform VXC_512Bits uniPackTmp2AndB_2x8; - _viv_uniform VXC_512Bits uniQuantU8toU8LoB_2x8; _viv_uniform VXC_512Bits uniQuantU8toU8HiB_2x8; _viv_uniform VXC_512Bits uniQuantU8toU8LoG_2x8; @@ -143,137 +130,3 @@ __kernel void pre_process_yuv444_copy_U8toU8( pos.z = rOrder; VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); } - -// store bgrbgrbgr -__kernel void pre_process_yuv444_copy_trans_U8( - __read_only image2d_t y_img, - __read_only image2d_t u_img, - __read_only image2d_t v_img, - __write_only image2d_array_t output, - global int * xRatio, - global int * yRatio, - global int * xOffset, - global int * yOffset, - float 
rMean, - float gMean, - float bMean, - float var, - int reverse_channel, - int trans - ) -{ - int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); - vxc_uchar16 Y, U, V; - vxc_int4 C0, C1, C2, C3; - vxc_uchar16 R, G, B; - vxc_uchar16 dst; - - VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - var *= outputScale; - float4 paramData = (float4)(bMean * var - zp, gMean * var - zp,\ - rMean * var - zp, var); - half4 paramData_f16; - _viv_asm(CONV, paramData_f16, paramData); - - //C = Y - 16; - //D = U - 128; - //E = V - 128; - // calculate R - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8] - int tmpV = -56992; - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4); - - VXC_DP4x4(dst, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(dst, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(dst, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(dst, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - - VXC_DP2x8(R, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); - VXC_DP2x8(R, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); - - // calculate G - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8] - // 298Y - 208V - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4); - // 34784 - 100U - ushort tmpG = 34784; - vxc_ushort8 tmpDstG0, tmpDstG1; - VXC_DP2x8(tmpDstG0, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); - VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2_2x8); - VXC_DP4x4(dst, C0, tmpDstG0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4); - VXC_DP4x4(dst, C1, tmpDstG0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4); - VXC_DP4x4(dst, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4); - VXC_DP4x4(dst, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4); - - VXC_DP2x8(G, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); - VXC_DP2x8(G, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); - - // calculate B - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8] - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4); - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4); - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4); - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4); - tmpV = -70688; - VXC_DP4x4(dst, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(dst, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(dst, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(dst, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - - VXC_DP2x8(B, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); - VXC_DP2x8(B, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); - - // reorder to bgr - vxc_uchar8 tmpdst0, tmpdst1; - vxc_uchar16 dst0, dst1, dst2; - - if(bOrder == 2) - { - vxc_uchar16 exchangeData = B; - B = R; - R = exchangeData; - } - - // BGR BGR BG - VXC_DP2x8(tmpdst0, B, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackBG0_2x8); - VXC_DP2x8(dst0, tmpdst0, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmpAndR_2x8); - - // RBG RBG RB - VXC_DP2x8(tmpdst0, R, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackRB0_2x8); - VXC_DP2x8(dst0, tmpdst0, G, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp0AndG_2x8); - - pos = (int4)(get_global_id(0) * 3, get_global_id(1), 0, 0); - - VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - pos.x += 16; - - // GRB GRB GR - VXC_DP2x8(tmpdst0, G, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackGR1_2x8); - VXC_DP2x8(dst1, tmpdst0, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmp1AndB_2x8); - - // BGR BGR BG - VXC_DP2x8(tmpdst0, B, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackBG1_2x8); - VXC_DP2x8(dst1, tmpdst0, R, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp1AndR_2x8); - - VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - pos.x += 16; - - // RBG RBG RB - VXC_DP2x8(tmpdst0, R, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackRB2_2x8); - VXC_DP2x8(dst2, tmpdst0, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmp2AndG_2x8); - - // GRB GRB GR - VXC_DP2x8(tmpdst1, G, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackGR2_2x8); - VXC_DP2x8(dst2, tmpdst1, B, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp2AndB_2x8); - - VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_trans_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_trans_u8.vx deleted file mode 100644 index 8217d2f..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_trans_u8.vx +++ /dev/null @@ -1,196 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniCalculateR1st_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8; -_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8; - -_viv_uniform VXC_512Bits uniCalculateB1st_4x4; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; -_viv_uniform VXC_512Bits uniDescaleU8_4x4; - -_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4; -_viv_uniform VXC_512Bits 
uniCalculateTmpGWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4; -_viv_uniform VXC_512Bits uniCalculateGWise_4x4; -_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4; - -_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4; - -_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4; - -_viv_uniform VXC_512Bits uniTransPackBgr1st_2x8; -_viv_uniform VXC_512Bits uniTransPackBgr2nd_2x8; - -_viv_uniform int bOrder; -_viv_uniform int rOrder; -_viv_uniform int zp; -_viv_uniform float outputScale; - -#define IMAGE_PRE_PROCESS_YUV444_TRANS(dst_name, dst_type) \ -__kernel void pre_process_yuv444_trans_U8to##dst_name( \ - __read_only image2d_t y_img, __read_only image2d_t u_img, \ - __read_only image2d_t v_img, __write_only image2d_t output, \ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, \ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) \ -{ \ - int4 gidx = get_global_id(0); \ - int gidy = get_global_id(1); \ - gidx += (int4)(0, 1, 2, 3); \ - \ - int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); \ - int4 sx = fx & 0xffff8000; \ - int fy, sy; \ - fx -= sx; \ - sx = sx >> 15; \ - fx = (fx +(1 << 4)) >> 5; \ - \ - fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); \ - sy = fy & 0xffff8000; \ - fy -= sy; \ - sy = sy >> 15; \ - \ - sy = sy < 0 ? 0 : sy; \ - fy = fy < 0 ? 
0 : fy; \ - \ - fy = (fy + (1<< 4)) >> 5; \ - sx += (*xOffset); \ - sy += (*yOffset); \ - int2 srcPos = (int2)(sx.x, sy); \ - \ - vxc_uchar16 Y, U, V; \ - vxc_int4 C0, C1, C2, C3; \ - vxc_uchar16 R, G, B; \ - \ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ - \ - srcPos.x = sx.y; \ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ - \ - srcPos.x = sx.z; \ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \ - \ - srcPos.x = sx.w; \ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \ - \ - int tmpV = -56992; \ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); \ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); \ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); \ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); \ - VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ - VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ - VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, 
VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ - VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ - \ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); \ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); \ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); \ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); \ - \ - ushort tmpG = 34784; \ - vxc_ushort8 tmpDstG, tmpDstG1; \ - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); \ - VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); \ - VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \ - VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \ - VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \ - VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \ - \ - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); \ - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); \ - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); \ - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); \ - tmpV = -70688; \ - VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ - VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ - VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ - VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ - \ - int4 result, temp1, temp2, dstR, dstG, dstB; \ - int4 tmpData0, tmpData1; \ - \ - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \ - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \ - temp1 = fx * tmpData0 + tmpData1; \ - \ - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \ - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \ - temp2 = fx * tmpData0 + tmpData1; \ - result = fy * temp2 + (temp1 << 10); \ - \ - tmpV = 1 << 19; \ - dst_type dst, tmpPack; \ - float4 tmpDst; \ - \ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ - tmpDst = (tmpDst - bMean) * var; \ - dstB = convert_int4_rte(tmpDst * outputScale + zp); \ - \ - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \ - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \ - temp1 = fx * tmpData0 + tmpData1; \ - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \ - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \ - temp2 = fx * tmpData0 + tmpData1; \ - result = fy * temp2 + (temp1 << 10); \ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ - tmpDst 
= (tmpDst - gMean) * var; \ - dstG = convert_int4_rte(tmpDst * outputScale + zp); \ - \ - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \ - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \ - temp1 = fx * tmpData0 + tmpData1; \ - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \ - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \ - temp2 = fx * tmpData0 + tmpData1; \ - result = fy * temp2 + (temp1 << 10); \ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ - tmpDst = (tmpDst - rMean) * var; \ - dstR = convert_int4_rte(tmpDst * outputScale + zp); \ - \ - if(bOrder == 2) \ - { \ - int4 exchangeData = dstB; \ - dstB = dstR; \ - dstR = exchangeData; \ - } \ - \ - VXC_DP2x8(tmpPack, dstB, dstG, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); \ - VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr1st_2x8); \ - VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr2nd_2x8); \ - \ - int2 dstPos = (int2)(get_global_id(0) * 3, gidy); \ - VXC_WriteImage(output, dstPos, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); \ -} -IMAGE_PRE_PROCESS_YUV444_TRANS(U8, vxc_uchar16) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_BF16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_BF16.vx index cd56af5..8f7826b 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_BF16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_BF16.vx @@ -28,37 +28,34 @@ __kernel void resize_bilinear_BF16toBF16_DOWN float top_y_f = floor(in_y); float y_lerp = in_y - top_y_f; int top_y_idx = convert_int(top_y_f); - int bottom_y_idx = top_y_idx + 1; vxc_short8 top; vxc_short8 bottom; vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(top, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.y; - VXC_ReadImage2DArray(top, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.z; - VXC_ReadImage2DArray(top, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.w; - VXC_ReadImage2DArray(top, input, coord_in, \ - 
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - - coord_in.y = bottom_y_idx; - coord_in.x = left_x_idx.x; - VXC_ReadImage2DArray(bottom, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.y; - VXC_ReadImage2DArray(bottom, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.z; - VXC_ReadImage2DArray(bottom, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.w; - VXC_ReadImage2DArray(bottom, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); vxc_ushort8 src; float4 left4; @@ -84,7 +81,14 @@ __kernel void resize_bilinear_BF16toBF16_DOWN vxc_ushort8 tmp, dst; _viv_asm(COPY, tmp, dst4, 16); dst.s0123 = tmp.s1357; - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } __kernel void resize_bilinear_BF16toBF16_UP @@ -107,22 +111,24 @@ __kernel void resize_bilinear_BF16toBF16_UP float top_y_f = floor(in_y); float y_lerp = in_y - top_y_f; int top_y_idx = convert_int(top_y_f); - float bottom_y_f = ceil(in_y); - int bottom_y_idx= convert_int(bottom_y_f); vxc_ushort8 src0, src1, src2, src3, dst0, dst1; vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(src0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input, coord_in, \ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.y = bottom_y_idx; - VXC_ReadImage2DArray(src2, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src3, input, coord_in, \ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); vxc_ushort8 bitextract_p0; vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; @@ -132,29 +138,36 @@ __kernel void resize_bilinear_BF16toBF16_UP VXC_DP2x8(maskShift, bitextract_p0, constData, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8); - do + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + 
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + float4 left4; + float4 right4; + float4 top4; + float4 bottom4; + + int loop = depth - 1; + while (coord_in.z < loop) { VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.z ++; - coord_in.y = top_y_idx; - VXC_ReadImage2DArray(src0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input, coord_in, \ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.y = bottom_y_idx; - VXC_ReadImage2DArray(src2, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src3, input, coord_in, \ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.w += input_desc.s4; + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.z ++; vxc_ushort8 dst_tmp; - float4 left4; - float4 right4; - float4 top4; - float4 bottom4; + VXC_DP2x8(dst_tmp, dst0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); _viv_asm(COPY, left4, dst_tmp, 16); @@ -176,7 +189,30 @@ __kernel void resize_bilinear_BF16toBF16_UP vxc_ushort8 tmp, dst; _viv_asm(COPY, tmp, dst4, 16); dst.s0123 = tmp.s1357; - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); - coord_out.z ++; - } while (coord_in.z < depth); + + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_out.w += output_desc.s4; + } + + VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + vxc_ushort8 dst_tmp; + VXC_DP2x8(dst_tmp, dst0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, left4, dst_tmp, 16); + VXC_DP2x8(dst_tmp, dst0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, right4, dst_tmp, 16); + right4 -= left4; + top4 = right4 * x_lerp + left4; + VXC_DP2x8(dst_tmp, dst1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, left4, dst_tmp, 16); + VXC_DP2x8(dst_tmp, dst1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, right4, dst_tmp, 16); + right4 -= left4; + bottom4 = right4 * x_lerp + left4; + bottom4 -= top4; + float4 dst4 = bottom4 * y_lerp + top4; + vxc_ushort8 tmp, dst; + _viv_asm(COPY, tmp, dst4, 16); + dst.s0123 = tmp.s1357; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_F16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_F16.vx index f910d21..463b5a2 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_F16.vx 
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_F16.vx @@ -1,7 +1,7 @@ #include "cl_viv_vx_ext.h" _viv_uniform VXC_512Bits uniExtact8Bit_2x8; -_viv_uniform VXC_512Bits uniFp16toFp32_4x4; +_viv_uniform VXC_512Bits uniFp16toFp32_left_4x4; _viv_uniform VXC_512Bits uniRightSubLeft_4x4; _viv_uniform VXC_512Bits uniExtactHalf8_2x8; _viv_uniform float2 scale_xy; @@ -27,94 +27,66 @@ __kernel void resize_bilinear_F16toF16_DOWN float4 left_x_f = floor(in_x); float4 x_lerp = in_x - left_x_f; int4 left_x_idx = convert_int4(left_x_f); - float4 right_x_f = ceil(in_x); - int4 right_x_idx = convert_int4(right_x_f); float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; float top_y_f = floor(in_y); float y_lerp = in_y - top_y_f; int top_y_idx = convert_int(top_y_f); - float bottom_y_f = ceil(in_y); - int bottom_y_idx= convert_int(bottom_y_f); - vxc_short8 top_left0, top_right0; - vxc_short8 bottom_left0, bottom_right0; - vxc_half8 top_left, top_right; - vxc_half8 bottom_left, bottom_right; + vxc_short8 top_short, bottom_short, dst; + vxc_half8 top, bottom, result; int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(top_left0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.y; - VXC_ReadImage2DArray(top_left0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.z; - VXC_ReadImage2DArray(top_left0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.w; - VXC_ReadImage2DArray(top_left0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, top_left, top_left0, 16); - - coord_in.x = right_x_idx.x; - VXC_ReadImage2DArray(top_right0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.y; - VXC_ReadImage2DArray(top_right0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.z; - VXC_ReadImage2DArray(top_right0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.w; - VXC_ReadImage2DArray(top_right0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, top_right, top_right0, 16); - - coord_in.y = bottom_y_idx; - coord_in.x = left_x_idx.x; - VXC_ReadImage2DArray(bottom_left0, input, 
coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.y; - VXC_ReadImage2DArray(bottom_left0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.z; - VXC_ReadImage2DArray(bottom_left0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.w; - VXC_ReadImage2DArray(bottom_left0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, bottom_left, bottom_left0, 16); - - coord_in.x = right_x_idx.x; - VXC_ReadImage2DArray(bottom_right0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.y; - VXC_ReadImage2DArray(bottom_right0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.z; - VXC_ReadImage2DArray(bottom_right0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.w; - VXC_ReadImage2DArray(bottom_right0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, bottom_right, bottom_right0, 16); + VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, top, top_short, 16); + _viv_asm(COPY, bottom, bottom_short, 16); float4 left4; float4 right4; float4 top4; float4 bottom4; - VXC_DP4x4(left4, top_left, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); - VXC_DP4x4(right4, top_right, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); - top4 = right4 * x_lerp + left4; - VXC_DP4x4(left4, bottom_left, bottom_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); - VXC_DP4x4(right4, bottom_right, bottom_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); - bottom4 = right4 * x_lerp + left4; - bottom4 -= top4; - float4 dst4 = bottom4 * y_lerp + top4; + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4); + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + top4 = right4 * x_lerp + left4; + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4); + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + bottom4 = right4 * x_lerp + left4; + bottom4 -= top4; + float4 dst4 = bottom4 * y_lerp + top4; + half4 tmp; _viv_asm(CONV, tmp, dst4); - VXC_DP2x8(top_left, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); - _viv_asm(COPY, top_left0, top_left, 16); - VXC_WriteImage2DArray(output, coord_out, top_left0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + VXC_DP2x8(result, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); + _viv_asm(COPY, dst, result, 16); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } __kernel void 
resize_bilinear_F16toU8_DOWN @@ -131,84 +103,50 @@ __kernel void resize_bilinear_F16toU8_DOWN float4 left_x_f = floor(in_x); float4 x_lerp = in_x - left_x_f; int4 left_x_idx = convert_int4(left_x_f); - float4 right_x_f = ceil(in_x); - int4 right_x_idx = convert_int4(right_x_f); float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; float top_y_f = floor(in_y); float y_lerp = in_y - top_y_f; int top_y_idx = convert_int(top_y_f); - float bottom_y_f = ceil(in_y); - int bottom_y_idx= convert_int(bottom_y_f); - vxc_short8 top_left0, top_right0; - vxc_short8 bottom_left0, bottom_right0; - vxc_half8 top_left, top_right; - vxc_half8 bottom_left, bottom_right; + + vxc_short8 top_short, bottom_short; + vxc_half8 top, bottom; int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(top_left0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.y; - VXC_ReadImage2DArray(top_left0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.z; - VXC_ReadImage2DArray(top_left0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.w; - VXC_ReadImage2DArray(top_left0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, top_left, top_left0, 16); + VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, top, top_short, 16); + _viv_asm(COPY, bottom, bottom_short, 16); - coord_in.x = right_x_idx.x; - VXC_ReadImage2DArray(top_right0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.y; - VXC_ReadImage2DArray(top_right0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.z; - VXC_ReadImage2DArray(top_right0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.w; - VXC_ReadImage2DArray(top_right0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, top_right, top_right0, 16); - - coord_in.y = bottom_y_idx; - coord_in.x = left_x_idx.x; - VXC_ReadImage2DArray(bottom_left0, input, coord_in, \ - 
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.y; - VXC_ReadImage2DArray(bottom_left0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.z; - VXC_ReadImage2DArray(bottom_left0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.w; - VXC_ReadImage2DArray(bottom_left0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, bottom_left, bottom_left0, 16); - - coord_in.x = right_x_idx.x; - VXC_ReadImage2DArray(bottom_right0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.y; - VXC_ReadImage2DArray(bottom_right0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.z; - VXC_ReadImage2DArray(bottom_right0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.w; - VXC_ReadImage2DArray(bottom_right0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, bottom_right, bottom_right0, 16); float4 left4; float4 right4; float4 top4; float4 bottom4; - VXC_DP4x4(left4, top_left, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); - VXC_DP4x4(right4, top_right, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4); + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); top4 = right4 * x_lerp + left4; - VXC_DP4x4(left4, bottom_left, bottom_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); - VXC_DP4x4(right4, bottom_right, bottom_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4); + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); bottom4 = right4 * x_lerp + left4; bottom4 -= top4; float4 dst4 = bottom4 * y_lerp + top4; @@ -216,7 +154,14 @@ __kernel void resize_bilinear_F16toU8_DOWN int4 dst = convert_int4_rte(dst4); vxc_uchar8 dst_uchar; VXC_DP2x8(dst_uchar, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); - VXC_WriteImage2DArray(output, coord_out, dst_uchar, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst_uchar, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } __kernel void resize_bilinear_F16toF16_UP @@ -239,24 +184,26 @@ __kernel void resize_bilinear_F16toF16_UP float top_y_f = floor(in_y); float y_lerp = in_y - top_y_f; int top_y_idx = convert_int(top_y_f); - float bottom_y_f = ceil(in_y); - int bottom_y_idx= convert_int(bottom_y_f); + vxc_ushort8 src0, src1, src2, src3, dst0, dst1; vxc_half8 top; vxc_half8 bottom; int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(src0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input, coord_in, \ - VXC_5BITOFFSET_XY(8, 
0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); - coord_in.y = bottom_y_idx; - VXC_ReadImage2DArray(src2, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src3, input, coord_in, \ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); vxc_ushort8 bitextract_p0; vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; @@ -266,32 +213,41 @@ __kernel void resize_bilinear_F16toF16_UP VXC_DP2x8(maskShift, bitextract_p0, constData, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8); - do + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + float4 left4; + float4 right4; + float4 top4; + float4 bottom4; + + int loop = depth - 1; + while (coord_in.z < loop) { VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, top, dst0, 16); _viv_asm(COPY, bottom, dst1, 16); + + + coord_in.w += input_desc.s4; + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); coord_in.z ++; - coord_in.y = top_y_idx; - VXC_ReadImage2DArray(src0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input, coord_in, \ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.y = bottom_y_idx; - VXC_ReadImage2DArray(src2, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src3, input, coord_in, \ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 left4; - float4 right4; - float4 top4; - float4 bottom4; - VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); - VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4); + + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4); + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); top4 = right4 * x_lerp + left4; - VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); - VXC_DP4x4(right4, 
bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4); + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4); + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); bottom4 = right4 * x_lerp + left4; bottom4 -= top4; float4 dst4 = bottom4 * y_lerp + top4; @@ -299,7 +255,28 @@ __kernel void resize_bilinear_F16toF16_UP _viv_asm(CONV, tmp, dst4); VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); _viv_asm(COPY, dst0, top, 16); - VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); - coord_out.z ++; - } while (coord_in.z < depth); + + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_out.w += output_desc.s4; + } + + VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, top, dst0, 16); + _viv_asm(COPY, bottom, dst1, 16); + + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4); + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + top4 = right4 * x_lerp + left4; + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4); + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + bottom4 = right4 * x_lerp + left4; + bottom4 -= top4; + float4 dst4 = bottom4 * y_lerp + top4; + half4 tmp; + _viv_asm(CONV, tmp, dst4); + VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); + _viv_asm(COPY, dst0, top, 16); + + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I16.vx index 7f5b21f..bdfa3fb 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I16.vx @@ -5,8 +5,8 @@ _viv_uniform float2 scale_xy; _viv_uniform int depth; _viv_uniform VXC_512Bits uniConvertI32toI16_2x8; _viv_uniform VXC_512Bits uniGetMaskShift_2x8; -_viv_uniform VXC_512Bits uniConvertDFP2FP32_part1_4x4; -_viv_uniform VXC_512Bits uniConvertDFP2FP32_4x4; +_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4; +_viv_uniform VXC_512Bits uniRightSubLeft_4x4; _viv_uniform float dfpScale; _viv_uniform float half_pixel_value; @@ -34,8 +34,6 @@ __kernel void resize_bilinear_I16toI16_UP float top_y_f = floor(in_y); float y_lerp = in_y - top_y_f; int top_y_idx = convert_int(top_y_f); - float bottom_y_f = ceil(in_y); - int bottom_y_idx= convert_int(bottom_y_f); vxc_ushort8 src0, src1, src2, src3, dst0, dst1; @@ -44,16 +42,19 @@ __kernel void resize_bilinear_I16toI16_UP int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(src0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input, coord_in, \ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); - coord_in.y = 
bottom_y_idx; - VXC_ReadImage2DArray(src2, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src3, input, coord_in, \ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); vxc_ushort8 bitextract_p0; vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; @@ -63,39 +64,42 @@ __kernel void resize_bilinear_I16toI16_UP VXC_DP2x8(maskShift, bitextract_p0, constData, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8); - do + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + float4 left4; + float4 right4; + float4 top4; + float4 bottom4; + + int loop = depth - 1; + while (coord_in.z < loop) { VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, top, dst0, 16); _viv_asm(COPY, bottom, dst1, 16); - - float4 left4; - float4 right4; - float4 top4; - float4 bottom4; - + coord_in.w += input_desc.s4; + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); coord_in.z ++; - coord_in.y = top_y_idx; - VXC_ReadImage2DArray(src0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input, coord_in, \ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.y = bottom_y_idx; - VXC_ReadImage2DArray(src2, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src3, input, coord_in, \ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); - VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4); + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); top4 = right4 * x_lerp + left4; VXC_DP4x4(left4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); VXC_DP4x4(right4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4); + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); 
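/* Editor's note (illustrative sketch, not part of the patch): the pair of VXC_DP4x4
 * calls above replaces the old uniConvertDFP2FP32_4x4 / _part1_4x4 tables. Judging
 * from the removal of the explicit "right4 -= left4;" step elsewhere in this diff,
 * the renamed uniRightSubLeft_4x4 table appears to produce (right - left) directly,
 * so the multiply-add lines that follow are plain bilinear interpolation. A scalar C
 * model of the same arithmetic, with hypothetical names: */
static float bilinear_lerp(float top_left, float top_right,
                           float bottom_left, float bottom_right,
                           float x_lerp, float y_lerp)
{
    float top    = top_left    + x_lerp * (top_right    - top_left);     /* top4    */
    float bottom = bottom_left + x_lerp * (bottom_right - bottom_left);  /* bottom4 */
    return top + y_lerp * (bottom - top);   /* dst4 = bottom4 * y_lerp + top4 */
}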
bottom4 = right4 * x_lerp + left4; bottom4 -= top4; float4 dst4 = bottom4 * y_lerp + top4; @@ -103,10 +107,30 @@ __kernel void resize_bilinear_I16toI16_UP int4 dst = convert_int4_rte(dst4); VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); - VXC_WriteImage2DArray(output, coord_out, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); - coord_out.z ++; - } while (coord_in.z < depth); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_out.w += output_desc.s4; + } + + VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, top, dst0, 16); + _viv_asm(COPY, bottom, dst1, 16); + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + top4 = right4 * x_lerp + left4; + VXC_DP4x4(left4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + bottom4 = right4 * x_lerp + left4; + bottom4 -= top4; + float4 dst4 = bottom4 * y_lerp + top4; + dst4 = dst4 * dfpScale; + int4 dst = convert_int4_rte(dst4); + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + } __kernel void resize_bilinear_I16toI16_DOWN @@ -125,103 +149,67 @@ __kernel void resize_bilinear_I16toI16_DOWN float4 left_x_f = floor(in_x); float4 x_lerp = in_x - left_x_f; int4 left_x_idx = convert_int4(left_x_f); - float4 right_x_f = ceil(in_x); - int4 right_x_idx = convert_int4(right_x_f); - float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; - float top_y_f = floor(in_y); float y_lerp = in_y - top_y_f; int top_y_idx = convert_int(top_y_f); - float bottom_y_f = ceil(in_y); - int bottom_y_idx= convert_int(bottom_y_f); - - vxc_short8 top_left, top_right; - vxc_short8 bottom_left, bottom_right; + vxc_short8 top, bottom, result; int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(top_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.y; - VXC_ReadImage2DArray(top_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.z; - VXC_ReadImage2DArray(top_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 
0), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.w; - VXC_ReadImage2DArray(top_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - coord_in.x = right_x_idx.x; - VXC_ReadImage2DArray(top_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.y; - VXC_ReadImage2DArray(top_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.z; - VXC_ReadImage2DArray(top_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.w; - VXC_ReadImage2DArray(top_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - coord_in.y = bottom_y_idx; - coord_in.x = left_x_idx.x; - VXC_ReadImage2DArray(bottom_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.y; - VXC_ReadImage2DArray(bottom_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.z; - VXC_ReadImage2DArray(bottom_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.w; - VXC_ReadImage2DArray(bottom_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - coord_in.x = right_x_idx.x; - VXC_ReadImage2DArray(bottom_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.y; - VXC_ReadImage2DArray(bottom_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.z; - VXC_ReadImage2DArray(bottom_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.w; - VXC_ReadImage2DArray(bottom_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); float4 left4; float4 right4; float4 top4; float4 bottom4; - VXC_DP4x4(left4, top_left, top_left, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); - VXC_DP4x4(right4, top_right, top_right, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); - - right4 -= left4; + VXC_DP4x4(left4, top, top, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); + VXC_DP4x4(right4, top, top, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); top4 = right4 * x_lerp + left4; - VXC_DP4x4(left4, bottom_left, bottom_left, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); - VXC_DP4x4(right4, bottom_right, bottom_right, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); - - right4 -= left4; + VXC_DP4x4(left4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); bottom4 = right4 * x_lerp + 
left4; - bottom4 -= top4; float4 dst4 = bottom4 * y_lerp + top4; - dst4 = dst4 * dfpScale; - int4 dst = convert_int4_rte(dst4); - VXC_DP2x8(top_left, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); - VXC_WriteImage2DArray(output, coord_out, top_left, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I8.vx index aebf873..0be6cc5 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I8.vx @@ -5,8 +5,8 @@ _viv_uniform float2 scale_xy; _viv_uniform int depth; _viv_uniform VXC_512Bits uniConvertI32toI16_2x8; _viv_uniform VXC_512Bits uniGetMaskShift_2x8; -_viv_uniform VXC_512Bits uniConvertDFP2FP32_part1_4x4; -_viv_uniform VXC_512Bits uniConvertDFP2FP32_4x4; +_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4; +_viv_uniform VXC_512Bits uniRightSubLeft_4x4; _viv_uniform float dfpScale; _viv_uniform float half_pixel_value; @@ -34,8 +34,6 @@ __kernel void resize_bilinear_I8toI8_UP float top_y_f = floor(in_y); float y_lerp = in_y - top_y_f; int top_y_idx = convert_int(top_y_f); - float bottom_y_f = ceil(in_y); - int bottom_y_idx= convert_int(bottom_y_f); vxc_uchar16 src0, src1, dst0, dst1; @@ -44,12 +42,15 @@ __kernel void resize_bilinear_I8toI8_UP int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(src0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); - coord_in.y = bottom_y_idx; - VXC_ReadImage2DArray(src1, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); vxc_ushort8 bitextract_p0; vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; @@ -59,37 +60,42 @@ __kernel void resize_bilinear_I8toI8_UP VXC_DP2x8(maskShift, bitextract_p0, constData, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8); - do + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + float4 left4; + float4 right4; + float4 top4; + float4 bottom4; + + int loop = depth - 1; + while (coord_in.z < loop) { VXC_BitExtract(dst0, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); VXC_BitExtract(dst1, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, top, dst0, 16); _viv_asm(COPY, bottom, dst1, 16); + coord_in.w += input_desc.s4; + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); 
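/* Editor's note (illustrative sketch, not part of the patch): throughout this diff the
 * VXC_ReadImage2DArray/VXC_WriteImage2DArray calls are replaced by img_load_3d /
 * img_store_3d through an int8 descriptor copied out of the image object. Judging
 * from the arithmetic alone, descriptor element s0 appears to hold the base address
 * and s4 the per-slice stride, so "baseAddr = z * s4 + s0" selects slice z and the
 * _UP kernels advance with "coord.w += desc.s4". The old do/while over depth is also
 * peeled into "while (z < depth - 1)" plus an epilogue, so the next slice can be
 * prefetched inside the loop without loading past the last one. A rough C model of
 * that pattern, with hypothetical names: */
typedef struct { int base; int slice_stride; } img_desc_t;  /* ~ desc.s0 / desc.s4 */

static void process_all_slices(const img_desc_t *desc, int depth)
{
    int addr = desc->base;                 /* baseAddr for slice 0              */
    /* load slice 0 here */
    for (int z = 0; z < depth - 1; ++z)
    {
        addr += desc->slice_stride;        /* point at slice z + 1              */
        /* prefetch slice z + 1, then compute and store slice z                 */
    }
    /* epilogue: compute and store the final slice without issuing another load */
}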
+ VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); coord_in.z ++; - coord_in.y = top_y_idx; - VXC_ReadImage2DArray(src0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord_in.y = bottom_y_idx; - VXC_ReadImage2DArray(src1, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - float4 left4; - float4 right4; - float4 top4; - float4 bottom4; VXC_DP4x4(left4, top, top, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); VXC_DP4x4(right4, top, top, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4); + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); top4 = right4 * x_lerp + left4; VXC_DP4x4(left4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); VXC_DP4x4(right4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4); + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); bottom4 = right4 * x_lerp + left4; bottom4 -= top4; @@ -97,10 +103,31 @@ __kernel void resize_bilinear_I8toI8_UP dst4 = dst4 * dfpScale; int4 dst = convert_int4_rte(dst4); VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); - VXC_WriteImage2DArray(output, coord_out, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); - coord_out.z ++; - } while (coord_in.z < depth); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_out.w += output_desc.s4; + } + + VXC_BitExtract(dst0, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_BitExtract(dst1, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, top, dst0, 16); + _viv_asm(COPY, bottom, dst1, 16); + VXC_DP4x4(left4, top, top, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); + VXC_DP4x4(right4, top, top, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + top4 = right4 * x_lerp + left4; + VXC_DP4x4(left4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + bottom4 = right4 * x_lerp + left4; + bottom4 -= top4; + float4 dst4 = bottom4 * y_lerp + top4; + dst4 = dst4 * dfpScale; + int4 dst = convert_int4_rte(dst4); + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } __kernel void resize_bilinear_I8toI8_DOWN @@ -112,98 +139,55 @@ __kernel void resize_bilinear_I8toI8_DOWN ) { int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value; - float4 left_x_f = floor(in_x); float4 x_lerp = in_x - left_x_f; int4 left_x_idx = convert_int4(left_x_f); - float4 right_x_f = ceil(in_x); - int4 right_x_idx = convert_int4(right_x_f); - float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; - float top_y_f = floor(in_y); float y_lerp = in_y - top_y_f; 
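/* Editor's note (illustrative sketch, not part of the patch): the coordinate setup
 * above uses the usual half-pixel source mapping, and the former ceil()-based
 * right_x_idx / bottom_y_idx variables are dropped because the second neighbour is
 * now fetched with a +1 texel offset from the floor()ed coordinate rather than a
 * separate indexed read. Scalar C sketch of the mapping; the names are mine: */
#include <math.h>

static void map_to_source(float out_coord, float scale, float half_pixel,
                          int *left_idx, float *lerp)
{
    float in = (out_coord + half_pixel) * scale - half_pixel;
    float left_f = floorf(in);
    *left_idx = (int)left_f;   /* the right/bottom neighbour is simply left_idx + 1 */
    *lerp = in - left_f;
}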
int top_y_idx = convert_int(top_y_f); - float bottom_y_f = ceil(in_y); - int bottom_y_idx= convert_int(bottom_y_f); - - vxc_char16 top_left, top_right; - vxc_char16 bottom_left, bottom_right; - + vxc_char16 top, bottom, result; int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(top_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.y; - VXC_ReadImage2DArray(top_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.z; - VXC_ReadImage2DArray(top_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.w; - VXC_ReadImage2DArray(top_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - coord_in.x = right_x_idx.x; - VXC_ReadImage2DArray(top_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.y; - VXC_ReadImage2DArray(top_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.z; - VXC_ReadImage2DArray(top_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.w; - VXC_ReadImage2DArray(top_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - coord_in.y = bottom_y_idx; - coord_in.x = left_x_idx.x; - VXC_ReadImage2DArray(bottom_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.y; - VXC_ReadImage2DArray(bottom_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.z; - VXC_ReadImage2DArray(bottom_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.w; - VXC_ReadImage2DArray(bottom_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - coord_in.x = right_x_idx.x; - VXC_ReadImage2DArray(bottom_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.y; - VXC_ReadImage2DArray(bottom_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.z; - VXC_ReadImage2DArray(bottom_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, 
VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.w; - VXC_ReadImage2DArray(bottom_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); float4 left4; float4 right4; float4 top4; float4 bottom4; - VXC_DP4x4(left4, top_left, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); - VXC_DP4x4(right4, top_right, top_right, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); - - right4 -= left4; + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); top4 = right4 * x_lerp + left4; - VXC_DP4x4(left4, bottom_left, bottom_left, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); - VXC_DP4x4(right4, bottom_right, bottom_right, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); - - right4 -= left4; + VXC_DP4x4(left4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); bottom4 = right4 * x_lerp + left4; bottom4 -= top4; @@ -213,6 +197,11 @@ __kernel void resize_bilinear_I8toI8_DOWN int4 dst = convert_int4_rte(dst4); - VXC_DP2x8(top_left, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); - VXC_WriteImage2DArray(output, coord_out, top_left, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8.vx index 4c21bd7..39f239a 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8.vx @@ -1,13 +1,13 @@ #include "cl_viv_vx_ext.h" -_viv_uniform VXC_512Bits uniU8SubZPtoFp32_4x4; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_left_4x4; +_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4; _viv_uniform VXC_512Bits uniExtact8Bit_2x8; _viv_uniform float2 scale_xy; _viv_uniform int depth; _viv_uniform int input_ZP; _viv_uniform float uint8Scale; _viv_uniform float output_ZP; -_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4; _viv_uniform VXC_512Bits uniConvertI32toI16_2x8; _viv_uniform VXC_512Bits uniGetMaskShift_2x8; _viv_uniform float half_pixel_value; @@ -26,69 +26,36 @@ __kernel void resize_bilinear_U8toF16_DOWN float4 left_x_f = floor(in_x); float4 x_lerp = in_x - left_x_f; int4 left_x_idx = convert_int4(left_x_f); - float4 right_x_f = ceil(in_x); - int4 right_x_idx = convert_int4(right_x_f); float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; float top_y_f = floor(in_y); float y_lerp = in_y - top_y_f; int top_y_idx = convert_int(top_y_f); - float bottom_y_f = ceil(in_y); - int bottom_y_idx= 
convert_int(bottom_y_f); - vxc_uchar16 top_left, top_right; - vxc_uchar16 bottom_left, bottom_right; + vxc_uchar16 top, bottom; int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(top_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.y; - VXC_ReadImage2DArray(top_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.z; - VXC_ReadImage2DArray(top_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.w; - VXC_ReadImage2DArray(top_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - coord_in.x = right_x_idx.x; - VXC_ReadImage2DArray(top_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.y; - VXC_ReadImage2DArray(top_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.z; - VXC_ReadImage2DArray(top_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.w; - VXC_ReadImage2DArray(top_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - coord_in.y = bottom_y_idx; - coord_in.x = left_x_idx.x; - VXC_ReadImage2DArray(bottom_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.y; - VXC_ReadImage2DArray(bottom_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.z; - VXC_ReadImage2DArray(bottom_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.w; - VXC_ReadImage2DArray(bottom_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - coord_in.x = right_x_idx.x; - VXC_ReadImage2DArray(bottom_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.y; - VXC_ReadImage2DArray(bottom_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.z; - VXC_ReadImage2DArray(bottom_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.w; - VXC_ReadImage2DArray(bottom_right, input, coord_in, \ - 
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); float4 left4; float4 right4; @@ -97,16 +64,12 @@ __kernel void resize_bilinear_U8toF16_DOWN unsigned char inputZP; _viv_asm(COPY, inputZP, input_ZP, 4); - VXC_DP4x4(left4, top_left, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4); - VXC_DP4x4(right4, top_right, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4); - - right4 -= left4; + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); top4 = right4 * x_lerp + left4; - VXC_DP4x4(left4, bottom_left, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4); - VXC_DP4x4(right4, bottom_right, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4); - - right4 -= left4; + VXC_DP4x4(left4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); bottom4 = right4 * x_lerp + left4; bottom4 -= top4; @@ -120,7 +83,12 @@ __kernel void resize_bilinear_U8toF16_DOWN vxc_short8 dst_short; _viv_asm(COPY, dst_short, dst, 16); - VXC_WriteImage2DArray(output, coord_out, dst_short.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst_short.s0246, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } __kernel void resize_bilinear_U8toU8_UP @@ -147,8 +115,6 @@ __kernel void resize_bilinear_U8toU8_UP float top_y_f = floor(in_y); float y_lerp = in_y - top_y_f; int top_y_idx = convert_int(top_y_f); - float bottom_y_f = ceil(in_y); - int bottom_y_idx= convert_int(bottom_y_f); vxc_uchar16 src0, src1; @@ -157,12 +123,15 @@ __kernel void resize_bilinear_U8toU8_UP int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(src0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); - coord_in.y = bottom_y_idx; - VXC_ReadImage2DArray(src1, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); vxc_ushort8 bitextract_p0; vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; @@ -172,46 +141,67 @@ __kernel void resize_bilinear_U8toU8_UP VXC_DP2x8(maskShift, bitextract_p0, constData, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8); - do + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + 
_viv_asm(MOV, coord_out.w, baseAddr); + + float4 left4; + float4 right4; + float4 top4; + float4 bottom4; + + int loop = depth - 1; + while (coord_in.z < loop) { VXC_BitExtract(top, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); VXC_BitExtract(bottom, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.w += input_desc.s4; + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); coord_in.z ++; - coord_in.y = top_y_idx; - VXC_ReadImage2DArray(src0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord_in.y = bottom_y_idx; - VXC_ReadImage2DArray(src1, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - float4 left4; - float4 right4; - float4 top4; - float4 bottom4; - unsigned char inputZP; _viv_asm(COPY, inputZP, input_ZP, 4); - VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4); - VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4); - + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); top4 = right4 * x_lerp + left4; VXC_DP4x4(left4, bottom, inputZP, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4); + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); VXC_DP4x4(right4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4); - + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); bottom4 = right4 * x_lerp + left4; bottom4 -= top4; float4 dst4 = bottom4 * y_lerp + top4; dst4 = dst4 * uint8Scale + output_ZP; int4 dst = convert_int4_rte(dst4); VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); - VXC_WriteImage2DArray(output, coord_out, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_out.w += output_desc.s4; + } - coord_out.z ++; - } while (coord_in.z < depth); + VXC_BitExtract(top, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_BitExtract(bottom, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + unsigned char inputZP; + _viv_asm(COPY, inputZP, input_ZP, 4); + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); + top4 = right4 * x_lerp + left4; + + VXC_DP4x4(left4, bottom, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); + bottom4 = right4 * x_lerp + left4; + bottom4 -= top4; + float4 dst4 = bottom4 * y_lerp + top4; + dst4 = dst4 * uint8Scale + output_ZP; + int4 dst = convert_int4_rte(dst4); + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } __kernel void resize_bilinear_U8toU8_DOWN @@ 
-228,69 +218,36 @@ __kernel void resize_bilinear_U8toU8_DOWN float4 left_x_f = floor(in_x); float4 x_lerp = in_x - left_x_f; int4 left_x_idx = convert_int4(left_x_f); - float4 right_x_f = ceil(in_x); - int4 right_x_idx = convert_int4(right_x_f); float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; float top_y_f = floor(in_y); float y_lerp = in_y - top_y_f; int top_y_idx = convert_int(top_y_f); - float bottom_y_f = ceil(in_y); - int bottom_y_idx= convert_int(bottom_y_f); - vxc_uchar16 top_left, top_right; - vxc_uchar16 bottom_left, bottom_right; + vxc_uchar16 top, bottom, result; int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(top_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.y; - VXC_ReadImage2DArray(top_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.z; - VXC_ReadImage2DArray(top_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.w; - VXC_ReadImage2DArray(top_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - coord_in.x = right_x_idx.x; - VXC_ReadImage2DArray(top_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.y; - VXC_ReadImage2DArray(top_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.z; - VXC_ReadImage2DArray(top_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.w; - VXC_ReadImage2DArray(top_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - coord_in.y = bottom_y_idx; - coord_in.x = left_x_idx.x; - VXC_ReadImage2DArray(bottom_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.y; - VXC_ReadImage2DArray(bottom_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.z; - VXC_ReadImage2DArray(bottom_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.w; - VXC_ReadImage2DArray(bottom_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - coord_in.x = right_x_idx.x; - VXC_ReadImage2DArray(bottom_right, 
input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.y; - VXC_ReadImage2DArray(bottom_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.z; - VXC_ReadImage2DArray(bottom_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.w; - VXC_ReadImage2DArray(bottom_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); float4 left4; float4 right4; @@ -299,25 +256,26 @@ __kernel void resize_bilinear_U8toU8_DOWN unsigned char inputZP; _viv_asm(COPY, inputZP, input_ZP, 4); - VXC_DP4x4(left4, top_left, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4); - VXC_DP4x4(right4, top_right, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4); - - right4 -= left4; + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); top4 = right4 * x_lerp + left4; - VXC_DP4x4(left4, bottom_left, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4); - VXC_DP4x4(right4, bottom_right, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4); - - right4 -= left4; + VXC_DP4x4(left4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); bottom4 = right4 * x_lerp + left4; bottom4 -= top4; float4 dst4 = bottom4 * y_lerp + top4; dst4 = dst4 * uint8Scale + output_ZP; - int4 dst = convert_int4_rte(dst4); - VXC_DP2x8(top_left, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); - VXC_WriteImage2DArray(output, coord_out, top_left, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_opt.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_opt.vx index 640560e..59e8211 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_opt.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_opt.vx @@ -69,7 +69,8 @@ __kernel void resize_bilinear_U8toU8_UP_opt baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_out.w, baseAddr); - do + int loop = depth - 1; + while (coord_in.z < loop) { VXC_BitExtract(top_bottom, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); VXC_BitExtract(top_bottom, src1, src1, maskShift, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0)); @@ -88,8 +89,17 @@ __kernel void resize_bilinear_U8toU8_UP_opt VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); coord_out.w += output_desc.s4; - coord_out.z 
++; - } while (coord_out.z < depth); + coord_in.z ++; + } + + VXC_BitExtract(top_bottom, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_BitExtract(top_bottom, src1, src1, maskShift, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0)); + vxc_uchar16 dst; + VXC_DP4x4_b(dst, lerp.hi, lerp.lo, top_bottom, + VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4x4_b); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + } #endif \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_nearest.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_nearest.vx index 9d2838c..7172017 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_nearest.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_nearest.vx @@ -28,18 +28,30 @@ __kernel void resize_nearest_F16toF16 vxc_short8 src; int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(src, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.y; - VXC_ReadImage2DArray(src, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.z; - VXC_ReadImage2DArray(src, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.w; - VXC_ReadImage2DArray(src, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } _viv_uniform VXC_512Bits uniGetExtractData_2x8; @@ -56,18 +68,29 @@ __kernel void resize_nearest_F16toF16_op vxc_ushort8 src0, src1, dst; int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(src0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input, coord_in, \ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - //in_x_idx = in_x_idx - in_x_idx.xxxx; + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 
8, 8, 16, 16, 16, 16, 16, 16, 16, 16); vxc_ushort8 input_idx; _viv_asm(COPY, input_idx, in_x_idx, 16); VXC_DP2x8(mask, input_idx, input_idx, \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetExtractData_2x8); VXC_BitExtract(dst, src0, src1, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } _viv_uniform VXC_512Bits uniConvertI8toI8_2x8; @@ -84,19 +107,31 @@ __kernel void resize_nearest_I8toI8 vxc_char16 src; int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(src, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.y; - VXC_ReadImage2DArray(src, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.z; - VXC_ReadImage2DArray(src, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.w; - VXC_ReadImage2DArray(src, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(src, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); - VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } __kernel void resize_nearest_I8toI8_op @@ -113,8 +148,14 @@ __kernel void resize_nearest_I8toI8_op vxc_char16 dst; int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(src0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8); vxc_ushort8 input_idx; _viv_asm(COPY, input_idx, in_x_idx, 16); @@ -123,7 +164,13 @@ __kernel void resize_nearest_I8toI8_op VXC_BitExtract(dst0, src0, src0, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, dst, dst0, 8); VXC_DP2x8(dst, dst, dst, VXC_MODIFIER(0, 
3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } __kernel void resize_nearest_U8toU8 @@ -139,22 +186,34 @@ __kernel void resize_nearest_U8toU8 vxc_uchar16 src; int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(src, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.y; - VXC_ReadImage2DArray(src, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.z; - VXC_ReadImage2DArray(src, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.w; - VXC_ReadImage2DArray(src, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + vxc_ushort8 multiplier; _viv_asm(COPY, multiplier, multAndoutZP, 16); VXC_DP2x8(src, src, multiplier, \ VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8); - VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } __kernel void resize_nearest_U8toU8_op @@ -170,8 +229,14 @@ __kernel void resize_nearest_U8toU8_op vxc_uchar16 src0, dst; int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(src0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8); vxc_ushort8 input_idx; _viv_asm(COPY, input_idx, in_x_idx, 16); @@ -180,7 +245,13 @@ __kernel void resize_nearest_U8toU8_op vxc_ushort8 multiplier; _viv_asm(COPY, multiplier, multAndoutZP, 16); VXC_DP2x8(dst, dst, multiplier, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8); - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + int8 output_desc; + _viv_asm(COPY, 
output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } __kernel void resize_nearest_I16toI16 @@ -196,19 +267,32 @@ __kernel void resize_nearest_I16toI16 vxc_short8 src; int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(src, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.y; - VXC_ReadImage2DArray(src, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.z; - VXC_ReadImage2DArray(src, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.w; - VXC_ReadImage2DArray(src, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(src, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); - VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } __kernel void resize_nearest_I16toI16_op @@ -224,10 +308,16 @@ __kernel void resize_nearest_I16toI16_op vxc_ushort8 src0, src1, dst0; vxc_short8 dst; int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(src0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input, coord_in, \ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); //in_x_idx = in_x_idx - in_x_idx.xxxx; vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16); @@ -237,5 +327,11 @@ __kernel void resize_nearest_I16toI16_op VXC_BitExtract(dst0, src0, src1, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, dst, dst0, 8); VXC_DP2x8(dst, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + int8 
output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/space2depth_internal.vx b/src/tim/vx/internal/src/libnnext/ops/vx/space2depth_internal.vx new file mode 100644 index 0000000..b8bb334 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/space2depth_internal.vx @@ -0,0 +1,135 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniExtractEvenUint8Stride2_2x8; +_viv_uniform VXC_512Bits uniExtractOddUint8Stride2_2x8; +_viv_uniform VXC_512Bits uniExtractEvenFp16Stride2_4x4; +_viv_uniform VXC_512Bits uniExtractOddFp16Stride2_4x4; + +_viv_uniform int input_depth; + +#define SPACE2DEPTH_INTERNAL_QINT_TO_QINT(src0_type_name, src1_type_name, read_type) \ +__kernel void space2depth_internal_##src0_type_name##to##src1_type_name( \ + image2d_array_t input, \ + image2d_array_t output, \ + int block_size_x, \ + int block_size_y \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + int4 coord = (int4)(gidx, gidy, gidz, 0); \ + read_type src; \ + VXC_ReadImage2DArray(src, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + \ + ushort stride_x = (ushort)block_size_x; \ + ushort stride_y = (ushort)block_size_y; \ + ushort sidx = (ushort)gidx; \ + ushort sidy = (ushort)gidy; \ + ushort tmpX = sidx % stride_x; \ + ushort tmpY = sidy % stride_y; \ + int tmpId0 = tmpX; \ + int tmpId1 = tmpY; \ + int4 coord_out = (int4)((int)(sidx / stride_x), (int)(sidy / stride_y), 0, 0); \ + coord_out.z = tmpId0 * input_depth + tmpId1 * block_size_x * input_depth + gidz; \ + VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ +} +SPACE2DEPTH_INTERNAL_QINT_TO_QINT(U8, U8, vxc_uchar16) +SPACE2DEPTH_INTERNAL_QINT_TO_QINT(I8, I8, vxc_char16) +SPACE2DEPTH_INTERNAL_QINT_TO_QINT(I16, I16, vxc_short8) + +__kernel void space2depth_internal_F16toF16( + image2d_array_t input, + image2d_array_t output, + int block_size_x, + int block_size_y + ) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(gidx, gidy, gidz, 0); + vxc_short8 data, imgVal0; + VXC_ReadImage2DArray(data, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + ushort stride_x = (ushort)block_size_x; + ushort stride_y = (ushort)block_size_y; + ushort sidx = (ushort)gidx; + ushort sidy = (ushort)gidy; + ushort tmpX = sidx % stride_x; + ushort tmpY = sidy % stride_y; + int tmpId0 = tmpX; + int tmpId1 = tmpY; + int4 coord_out = (int4)((int)(sidx / stride_x), (int)(sidy / stride_y), 0, 0); + coord_out.z = tmpId0 * input_depth + tmpId1 * block_size_x * input_depth + gidz; + + VXC_WriteImage2DArray(output, coord_out, data, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} + +#define SPACE2DEPTH_INTERNAL_QINT_TO_QINT_X2Y1(src0_type_name, src1_type_name, read_type, write_type) \ +__kernel void space2depth_internal_##src0_type_name##to##src1_type_name##_X2Y1( \ + image2d_array_t input, \ + image2d_array_t output, \ + int block_size_x, \ + int block_size_y \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + \ + int4 coord = (int4)(gidx, gidy, gidz, 0); \ + int4 coord_out = (int4)(gidx 
>> 1, gidy, gidz, 0); \ + int out_d1; \ + read_type imageData; \ + write_type imgVal0, imgVal1; \ + \ + VXC_ReadImage2DArray(imageData, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + out_d1 = gidz + input_depth; \ + \ + VXC_DP2x8(imgVal0, imageData, imageData,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractEvenUint8Stride2_2x8); \ + VXC_DP2x8(imgVal1, imageData, imageData,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddUint8Stride2_2x8); \ + VXC_WriteImage2DArray(output, coord_out, imgVal0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_out.z = out_d1; \ + VXC_WriteImage2DArray(output, coord_out, imgVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +SPACE2DEPTH_INTERNAL_QINT_TO_QINT_X2Y1(U8, U8, vxc_uchar16, vxc_uchar16) +SPACE2DEPTH_INTERNAL_QINT_TO_QINT_X2Y1(I8, I8, vxc_char16, vxc_char16) + +#define SPACE2DEPTH_INTERNAL_16BITS_X2Y1(src0_type_name, src1_type_name, read_type, write_type) \ +__kernel void space2depth_internal_##src0_type_name##to##src1_type_name##_X2Y1( \ + image2d_array_t input, \ + image2d_array_t output, \ + int block_size_x, \ + int block_size_y \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + \ + int4 coord = (int4)(gidx, gidy, gidz, 0); \ + int4 coord_out = (int4)(gidx >> 1, gidy, gidz, 0); \ + int out_d1; \ + read_type imageData; \ + write_type imgVal0, imgVal1; \ + \ + VXC_ReadImage2DArray(imageData, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + out_d1 = gidz + input_depth; \ + VXC_DP4x4(imgVal0, imageData, imageData, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractEvenFp16Stride2_4x4); \ + VXC_DP4x4(imgVal1, imageData, imageData, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractOddFp16Stride2_4x4); \ + VXC_WriteImage2DArray(output, coord_out, imgVal0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_out.z = out_d1; \ + VXC_WriteImage2DArray(output, coord_out, imgVal1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +SPACE2DEPTH_INTERNAL_16BITS_X2Y1(I16, I16, vxc_short8, vxc_short8) +SPACE2DEPTH_INTERNAL_16BITS_X2Y1(F16, F16, vxc_short8, vxc_short8) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/upsamplescale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/upsamplescale.vx new file mode 100644 index 0000000..efc9266 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/upsamplescale.vx @@ -0,0 +1,58 @@ + +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvertDatatoF32_4x4; +_viv_uniform float output_scale; +_viv_uniform float tail; + +#define UPSAMPLE_SCALETO_FUN(src_name, dst_name, read_type, src_type, dst_type, write_type, conv_func) \ + __kernel void upsamplescale_##src_name##to##dst_name( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int stride, \ + float scale) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + read_type read_val; \ + src_type src_val; \ + dst_type dst_val; \ + write_type write_val; \ + VXC_ReadImage2DArray(read_val, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src_val, read_val, 16); \ + coord.xy *= stride; \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord.w, baseAddr); \ + float4 data; \ + 
VXC_DP4x4(data, src_val, src_val, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertDatatoF32_4x4); \ + data = data * output_scale + tail; \ + _viv_asm(conv_func, dst_val, data); \ + _viv_asm(COPY, write_val, dst_val, 16); \ + int4 coord_out = coord; \ + for (int y = 0; y < stride; y++) \ + { \ + coord_out.x = coord.x; \ + for (int x = 0; x < stride; ) \ + { \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, write_val, \ + VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); \ + x++; \ + coord_out.x ++; \ + } \ + coord_out.y ++; \ + } \ +} + +UPSAMPLE_SCALETO_FUN(F16, F16, vxc_short8, vxc_half8, half4, short4, CONV) +UPSAMPLE_SCALETO_FUN(F16, I16, vxc_short8, vxc_half8, int4, short4, CONV_RTE) +UPSAMPLE_SCALETO_FUN(F16, I8, vxc_short8, vxc_half8, int4, char4, CONV_RTE) +UPSAMPLE_SCALETO_FUN(F16, U8, vxc_short8, vxc_half8, int4, uchar4, CONV_RTE) +UPSAMPLE_SCALETO_FUN(I16, I16, vxc_short8, vxc_short8, int4, short4, CONV_RTE) +UPSAMPLE_SCALETO_FUN(I16, F16, vxc_short8, vxc_short8, half4, short4, CONV) +UPSAMPLE_SCALETO_FUN(I8, I8, vxc_char16, vxc_char16, int4, char4, CONV_RTE) +UPSAMPLE_SCALETO_FUN(I8, F16, vxc_short8, vxc_short8, half4, short4, CONV) +UPSAMPLE_SCALETO_FUN(U8, U8, vxc_uchar16, vxc_uchar16, int4, uchar4, CONV_RTE) +UPSAMPLE_SCALETO_FUN(U8, F16, vxc_short8, vxc_short8, half4, short4, CONV) + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/upsamplescale_k2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/upsamplescale_k2.vx new file mode 100644 index 0000000..d1935b1 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/upsamplescale_k2.vx @@ -0,0 +1,83 @@ + +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniUpScale2X_lo_2x8; +_viv_uniform VXC_512Bits uniUpScale2X_hi_2x8; +_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp + +#define UPSAMPLE_SCALETO8B_FUN(src_name, dst_name, read_type, src_type, dst_type) \ + __kernel void upsamplescale_##src_name##to##dst_name##_K2( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int stride, \ + float scale) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + read_type read_val; \ + src_type src_val; \ + dst_type dst_val; \ + VXC_ReadImage2DArray(read_val, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src_val, read_val, 16); \ + coord.xy <<= 1; \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord.w, baseAddr); \ + vxc_ushort8 multiplier; \ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \ + VXC_DP2x8(dst_val, src_val, multiplier, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniUpScale2X_lo_2x8); \ + VXC_DP2x8(dst_val, src_val, multiplier, \ + VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniUpScale2X_hi_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst_val, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \ + coord.y ++; \ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst_val, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \ +} + +UPSAMPLE_SCALETO8B_FUN(F16, I8, vxc_short8, vxc_half8, vxc_char16) +UPSAMPLE_SCALETO8B_FUN(F16, U8, vxc_short8, vxc_half8, vxc_uchar16) +UPSAMPLE_SCALETO8B_FUN(I8, I8, vxc_char16, vxc_char16, vxc_char16) +UPSAMPLE_SCALETO8B_FUN(U8, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16) + +#define UPSAMPLE_SCALETO16B_FUN(src_name, dst_name, read_type, src_type, dst_type, write_type) \ + __kernel void 
upsamplescale_##src_name##to##dst_name##_K2( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int stride, \ + float scale) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + read_type read_val; \ + src_type src_val; \ + dst_type dst0_val; \ + dst_type dst1_val; \ + write_type write_val; \ + VXC_ReadImage2DArray(read_val, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src_val, read_val, 16); \ + coord.xy <<= 1; \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord.w, baseAddr); \ + vxc_ushort8 multiplier; \ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \ + VXC_DP2x8(dst0_val, src_val, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniUpScale2X_lo_2x8); \ + VXC_DP2x8(dst1_val, src_val, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniUpScale2X_hi_2x8); \ + _viv_asm(COPY, write_val, dst0_val, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + coord.y ++; \ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, write_val, dst1_val, 16); \ + coord.xy = coord.xy + (int2)(8, -1); \ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + coord.y ++; \ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ +} +UPSAMPLE_SCALETO16B_FUN(F16, F16, vxc_short8, vxc_half8, vxc_half8, vxc_short8) +UPSAMPLE_SCALETO16B_FUN(F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8) +UPSAMPLE_SCALETO16B_FUN(I8, F16, vxc_char16, vxc_char16, vxc_half8, vxc_short8) +UPSAMPLE_SCALETO16B_FUN(U8, F16, vxc_uchar16, vxc_uchar16, vxc_half8, vxc_short8) +UPSAMPLE_SCALETO16B_FUN(I16, F16, vxc_short8, vxc_short8, vxc_half8, vxc_short8) +UPSAMPLE_SCALETO16B_FUN(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_crop.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_crop.vx deleted file mode 100644 index bc5e1d0..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_crop.vx +++ /dev/null @@ -1,111 +0,0 @@ -#include "cl_viv_vx_ext.h" - -//-----------------------------------------------tensor crop------------------------------- -__kernel void vxcTensorCrop_Int16( - __read_only image2d_array_t input, - __write_only image2d_array_t output, - int offset0, - int offset1, - int offset2) -{ - int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - vxc_ushort8 src0, src1, src2, src3; - - VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - int4 coord_out = (int4)(get_global_id(0) - offset0, get_global_id(1)\ - - offset1, get_global_id(2) - offset2, 0); - - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord_out.y ++; - 
VXC_WriteImage2DArray(output, coord_out, src1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord_out.y ++; - VXC_WriteImage2DArray(output, coord_out, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord_out.y ++; - VXC_WriteImage2DArray(output, coord_out, src3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} - -__kernel void vxcTensorCrop_Int8( - __read_only image2d_array_t input, - __write_only image2d_array_t output, - int offset0, - int offset1, - int offset2) -{ - int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_uchar16 src0, src1, src2, src3; - - VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - int4 coord_out = (int4)(get_global_id(0) - offset0, get_global_id(1) - offset1,\ - get_global_id(2) - offset2, 0); - - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); - coord_out.y ++; - VXC_WriteImage2DArray(output, coord_out, src1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); - coord_out.y ++; - VXC_WriteImage2DArray(output, coord_out, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); - coord_out.y ++; - VXC_WriteImage2DArray(output, coord_out, src3, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); -} - -_viv_uniform VXC_512Bits uniConvertInt16toFp16_2x8; - -__kernel void vxcTensorCrop_Int16_Fp16( - __read_only image2d_array_t input, - __write_only image2d_array_t output, - int offset0, - int offset1, - int offset2) -{ - int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - vxc_short8 src0, src1, src2, src3; - - VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - int4 coord_out = (int4)(get_global_id(0) - offset0, get_global_id(1)\ - - offset1, get_global_id(2) - offset2, 0); - - vxc_half8 dst0, dst1, dst2, dst3; - VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt16toFp16_2x8); - VXC_DP2x8(dst1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt16toFp16_2x8); - VXC_DP2x8(dst2, src2, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt16toFp16_2x8); - VXC_DP2x8(dst3, src3, src3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt16toFp16_2x8); - - vxc_short8 out0, out1, out2, out3; - _viv_asm(COPY, out0, dst0, 16); - _viv_asm(COPY, out1, dst1, 16); - _viv_asm(COPY, out2, dst2, 16); - _viv_asm(COPY, out3, dst3, 16); - - VXC_WriteImage2DArray(output, coord_out, out0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord_out.y ++; - VXC_WriteImage2DArray(output, coord_out, out1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord_out.y ++; - VXC_WriteImage2DArray(output, coord_out, out2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - 
coord_out.y ++; - VXC_WriteImage2DArray(output, coord_out, out3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_fullconnect2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_fullconnect2.vx deleted file mode 100644 index a052f8c..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_fullconnect2.vx +++ /dev/null @@ -1,63 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform int loopNum; -_viv_uniform VXC_512Bits uniMulAcc_16x1; -__kernel void vsi_nn_kernel_fullconnect2( - __read_only image2d_array_t input, - __read_only image2d_array_t weight, - __read_only image2d_array_t bias, - __write_only image2d_array_t output) -{ - int4 coord_in = (int4)(16, get_global_id(0), get_global_id(1), 0); - int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); - - vxc_short8 v0, v1, v2, v3, v4, v5, v6, v7; - vxc_half8 i0, i1, i2, i3; - vxc_half8 w0, w1, w2, w3; - float4 sum = 0; - float dst = 0; - dst = read_imagef(bias, coord_in.ywww).x; - do - { - VXC_ReadImage(v0, input, coord_in.xz, VXC_5BITOFFSET_XY(-16, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, i0, v0, 16); - VXC_ReadImage(v1, weight, coord_in.xy, VXC_5BITOFFSET_XY(-16, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, w0, v1, 16); - VXC_ReadImage(v2, input, coord_in.xz, VXC_5BITOFFSET_XY(-8, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, i1, v2, 16); - VXC_ReadImage(v3, weight, coord_in.xy, VXC_5BITOFFSET_XY(-8, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, w1, v3, 16); - VXC_ReadImage(v4, input, coord_in.xz, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, i2, v4, 16); - VXC_ReadImage(v5, weight, coord_in.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, w2, v5, 16); - VXC_ReadImage(v6, input, coord_in.xz, VXC_5BITOFFSET_XY(8, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, i3, v6, 16); - VXC_ReadImage(v7, weight, coord_in.xy, VXC_5BITOFFSET_XY(8, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, w3, v7, 16); - - coord_in.x += 32; - - VXC_DP16x1(sum, i0, w0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniMulAcc_16x1); - VXC_DP16x1(sum, i1, w1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniMulAcc_16x1); - VXC_DP16x1(sum, i2, w2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), uniMulAcc_16x1); - VXC_DP16x1(sum, i3, w3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), uniMulAcc_16x1); - - float4 tmp = {1, 1, 1, 1}; - dst = dst + dot(sum, tmp); - - } while (coord_in.x < loopNum); - - vxc_half v; - _viv_asm(CONV, v, dst); - _viv_asm(COPY, v0, v, 16); - VXC_WriteImage(output, coord_out.xy, v0, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_layernormalize_U8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_layernormalize_U8.vx deleted file mode 100644 index 118764e..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_layernormalize_U8.vx +++ /dev/null @@ -1,129 +0,0 @@ -#include "cl_viv_vx_ext.h" - -/*****************************layernorm uint8 to fp16****************************/ -_viv_uniform int width; -_viv_uniform float dimRatio; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits 
uniConvert3rdUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniSumU8_16x1; -_viv_uniform VXC_512Bits uniSqrSum_16x1; -_viv_uniform float input_scale; -_viv_uniform int inputZP; -_viv_uniform int sumInZp; -_viv_uniform int tmpZp1; -_viv_uniform int tmpZp2; -_viv_uniform float e2InScale; -_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; -_viv_uniform VXC_512Bits UniPackFP16even_2x8; - -__kernel void vxcLayerNormU8toFp16( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_array_t output, - float eps) -{ - int4 coord = (int4)(0, get_global_id(1), 0, 0); - vxc_uchar16 src0; - float sum = 0, sqr = 0; - int tmpSum = 0, tmpSqr = 0; - vxc_int4 tmpSum1; - vxc_int4 tmpSqr1; - - for(coord.x = 0; coord.x < width; coord.x += 16) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); - tmpSum += (tmpSum1.x); - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); - tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x); - } - sum = (tmpSum + sumInZp) * input_scale; - sqr = (tmpSqr + tmpZp2) * e2InScale; - - float mean, vari; - mean = sum * dimRatio; - vari = sqr*dimRatio - mean*mean; - vari += eps; - vari = rsqrt(vari); - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; - int4 coord_bias = (int4)(0, 0, 0, 0); - vxc_half8 scale_h; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_short8 src1, outval; - short zp = inputZP; - half4 tmpVal0, tmpVal1; - vxc_half8 dst; - - for(coord.x = 0; coord.x < width; coord.x += 16) - { - coord_bias.x = coord.x; - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - bias_f1 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert4thUint8SubZpToFp32_4x4); - tmpData0 *= input_scale; - tmpData1 *= input_scale; - tmpData2 *= input_scale; - tmpData3 *= input_scale; - - vxc_float4 norm; - tmpData0 -= mean; - norm = scale_f0 * vari * tmpData0 + bias_f0; - bias_f0 = read_imagef(bias, coord_bias); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - coord_bias.x += 4; - _viv_asm(CONV, tmpVal0, norm); - - tmpData1 -= mean; - norm = scale_f1 * vari * tmpData1 + bias_f1; - bias_f1 = read_imagef(bias, coord_bias); - VXC_DP4x4(scale_f1, scale_h, scale_h, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - UniPackFP16even_2x8); - _viv_asm(COPY, outval, dst, 16); - int2 coord_out = (int2)(coord.x, coord.y); - VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - tmpData2 -= mean; - norm = scale_f0 * vari * tmpData2 + bias_f0; - _viv_asm(CONV, tmpVal0, norm); - - tmpData3 -= mean; - norm = scale_f1 * vari * tmpData3 + bias_f1; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - UniPackFP16even_2x8); - _viv_asm(COPY, outval, dst, 16); - coord_out.x += 8; - VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_resize.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_resize.vx deleted file mode 100644 index 8175ced..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_resize.vx +++ /dev/null @@ -1,38 +0,0 @@ -#include "cl_viv_vx_ext.h" - -//--------------------------resize------------------------- -_viv_uniform VXC_512Bits uniPackEvenData_2x8; -__kernel void resize_16bits_downsample_quarter - ( - __read_only image2d_array_t input, - __write_only image2d_array_t output - ) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - vxc_short8 src0, src1; - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(8, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - coord = coord >> 1; - VXC_DP2x8(src0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardInf, 0), uniPackEvenData_2x8); - VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} - -__kernel void resize_8bits_downsample_quarter - ( - __read_only image2d_array_t input, - __write_only image2d_array_t output - ) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - vxc_char16 src0; - vxc_char8 dst; - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - coord = coord >> 1; - dst = src0.s02468ace; - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_scale.vx deleted file mode 100644 index 3c9551d..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_scale.vx +++ /dev/null @@ -1,49 +0,0 @@ -#include "cl_viv_vx_ext.h" - -//--------------------------scale------------------------- -_viv_uniform VXC_512Bits uniExtractHalf8_2x8; -_viv_uniform VXC_512Bits uniFp16MulFp16ToFp32_Lo_4x4; -_viv_uniform VXC_512Bits uniFp16MulFp16ToFp32_Hi_4x4; -__kernel void scale_fp16 - ( - __read_only image2d_array_t input, - __read_only image2d_array_t weights, - __read_only image2d_array_t biases, - __write_only image2d_array_t output - ) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); - vxc_short8 vec0, vec1; - vxc_half8 src0; - vxc_half8 w0; - vxc_float4 b0, b1; - vxc_float4 dst0, dst1; - VXC_ReadImage(vec0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, src0, vec0, 16); - VXC_ReadImage(vec1, weights, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 
0)); - _viv_asm(COPY, w0, vec1, 16); - - coord.z = coord.x + 4; - - b0 = read_imagef(biases, coord.xwww); - b1 = read_imagef(biases, coord.zwww); - - VXC_DP4x4(dst0, src0, w0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniFp16MulFp16ToFp32_Lo_4x4); - VXC_DP4x4(dst1, src0, w0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniFp16MulFp16ToFp32_Hi_4x4); - dst0 += b0; - dst1 += b1; - - half4 t0, t1; - - _viv_asm(CONV, t0, dst0); - _viv_asm(CONV, t1, dst1); - - VXC_DP2x8(w0, t0, t1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); - _viv_asm(COPY, vec0, w0, 16); - - VXC_WriteImage(output, coord.xy, vec0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_shufflechannel.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_shufflechannel.vx deleted file mode 100644 index 9800aa8..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_shufflechannel.vx +++ /dev/null @@ -1,67 +0,0 @@ -#include "cl_viv_vx_ext.h" - -/******************shuffle channel float16/int16********************/ -_viv_uniform int group_column; -_viv_uniform float rgroup_column; - -__kernel void shuffleChannelVXC( - image2d_array_t input, - image2d_array_t output, - int group_number, - int axis) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - vxc_short8 src0, src1, src2, src3; - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 1),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src2, input, coord, VXC_5BITOFFSET_XY(0, 2),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src3, input, coord, VXC_5BITOFFSET_XY(0, 3),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - int coordz = coord.z; - int index_col = coordz * rgroup_column; - int index_row = coordz - index_col * group_column; - coord.z = index_row * group_number + index_col; - VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord.y ++; - VXC_WriteImage2DArray(output, coord, src1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord.y ++; - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord.y ++; - VXC_WriteImage2DArray(output, coord, src3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} - -/*****************shuffle channel int8/uint8****************************/ - -__kernel void shuffleChannel8BitsVXC( - image2d_array_t input, - image2d_array_t output, - int group_number, - int axis) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - vxc_char16 src0, src1, src2, src3; - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 1),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src2, input, coord, VXC_5BITOFFSET_XY(0, 2),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src3, input, coord, VXC_5BITOFFSET_XY(0, 3),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - int coordz = coord.z; - int index_col = coordz * rgroup_column; - int index_row = coordz - index_col * group_column; - coord.z = index_row * group_number + index_col; - VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); - coord.y ++; - VXC_WriteImage2DArray(output, coord, 
src1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); - coord.y ++; - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); - coord.y ++; - VXC_WriteImage2DArray(output, coord, src3, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_shufflechannel_axis1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_shufflechannel_axis1.vx deleted file mode 100644 index a4e0fff..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_shufflechannel_axis1.vx +++ /dev/null @@ -1,65 +0,0 @@ -#include "cl_viv_vx_ext.h" - -/******************shuffle channel float16/int16********************/ -_viv_uniform int group_column; -_viv_uniform float rgroup_column; - -__kernel void shuffleChannel16Bits_Axis1( - image2d_array_t input, - image2d_array_t output, - int group_number, - int axis) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - int4 coord_out = coord; - vxc_short8 src0, src1, src2, src3; - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.x += 8; - VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.x += 8; - VXC_ReadImage2DArray(src2, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.x += 8; - VXC_ReadImage2DArray(src3, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - int coordy = coord.y; - int index_col = coordy * rgroup_column; - int index_row = coordy - index_col * group_column; - coord_out.y = index_row * group_number + index_col; - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord_out.x += 8; - VXC_WriteImage2DArray(output, coord_out, src1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord_out.x += 8; - VXC_WriteImage2DArray(output, coord_out, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord_out.x += 8; - VXC_WriteImage2DArray(output, coord_out, src3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} - -/*****************shuffle channel int8/uint8****************************/ - -__kernel void shuffleChannel8Bits_Axis1( - image2d_array_t input, - image2d_array_t output, - int group_number, - int axis) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - int4 coord_out = coord; - vxc_char16 src0, src1; - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord.x += 16; - VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - int coordy = coord.y; - int index_col = coordy * rgroup_column; - int index_row = coordy - index_col * group_column; - coord_out.y = index_row * group_number + index_col; - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); - coord_out.x += 16; - VXC_WriteImage2DArray(output, coord_out, src1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_space2depth.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_space2depth.vx deleted file mode 100644 index 01957b0..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_space2depth.vx +++ /dev/null @@ -1,41 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniExtractEvenFp16Stride2_4x4; 
-_viv_uniform VXC_512Bits uniExtractOddFp16Stride2_4x4; -_viv_uniform int input_depth; - -__kernel void vxcReorg2_fp16_fp16_sx2_sy1 - ( - image2d_array_t input, - image2d_array_t output, - int stridex, - int stridey - ) -{ - int gidx = get_global_id(0); - int gidy = get_global_id(1); - int gidz = get_global_id(2); - - int4 coord = (int4)(gidx, gidy, gidz, 0); - int4 coord_out = (int4)(gidx >> 1, gidy, 0, 0); - int out_d0, out_d1; - vxc_short8 imageData; - vxc_short8 imgVal0, imgVal1; - //int tmpw = gidz / input_depth; \n\ - //int tmpz = gidz % input_depth; \n\ - - VXC_ReadImage2DArray(imageData, input, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_DP4x4(imgVal0, imageData, imageData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), - uniExtractEvenFp16Stride2_4x4); - VXC_DP4x4(imgVal1, imageData, imageData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), - uniExtractOddFp16Stride2_4x4); - - out_d0 = gidz * 2 * 1; - out_d1 = out_d0 + 1; - - coord_out.z = out_d0; - VXC_WriteImage2DArray(output, coord_out, imgVal0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - coord_out.z = out_d1; - VXC_WriteImage2DArray(output, coord_out, imgVal1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); -} diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c index 366041c..fd2db22 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c @@ -88,6 +88,84 @@ __kernel void a_times_b_plus_c_F16_F16_F16toF16_2D\n\ VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ +_viv_uniform VXC_512Bits uniExtractHalf8_2x8;\n\ +_viv_uniform VXC_512Bits uniA_Times_B_lo_4x4;\n\ +_viv_uniform VXC_512Bits uniA_Times_B_hi_4x4;\n\ +__kernel void a_times_b_plus_c_F16_F16_F32toF16\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __read_only image2d_array_t input2,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_half8 src0, src1, dst;\n\ + vxc_ushort8 vec0, vec1, result;\n\ + float4 b0, b1;\n\ + float4 dst0, dst1;\n\ +\n\ + VXC_ReadImage2DArray(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src0, vec0, 16);\n\ + VXC_ReadImage2DArray(vec1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src1, vec1, 16);\n\ + b0 = read_imagef(input2, coord);\n\ + coord.x += 4;\n\ + b1 = read_imagef(input2, coord);\n\ + coord.x -= 4;\n\ +\n\ + VXC_DP4x4(dst0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniA_Times_B_lo_4x4);\n\ + VXC_DP4x4(dst1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniA_Times_B_hi_4x4);\n\ + dst0 += b0;\n\ + dst1 += b1;\n\ +\n\ + half4 t0, t1;\n\ + _viv_asm(CONV, t0, dst0);\n\ + _viv_asm(CONV, t1, dst1);\n\ + VXC_DP2x8(dst, t0, t1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8);\n\ + _viv_asm(COPY, result, dst, 16);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, result, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void a_times_b_plus_c_F16_F16_F32toF16_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __read_only image2d_t input2,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + 
vxc_half8 src0, src1, dst;\n\ + vxc_ushort8 vec0, vec1, result;\n\ + float4 b0, b1;\n\ + float4 dst0, dst1;\n\ +\n\ + VXC_ReadImage(vec0, input0, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src0, vec0, 16);\n\ + VXC_ReadImage(vec1, input1, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src1, vec1, 16);\n\ + b0 = read_imagef(input2, coord.xy);\n\ + coord.z = coord.x + 4;\n\ + b1 = read_imagef(input2, coord.zy);\n\ +\n\ + VXC_DP4x4(dst0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniA_Times_B_lo_4x4);\n\ + VXC_DP4x4(dst1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniA_Times_B_hi_4x4);\n\ + dst0 += b0;\n\ + dst1 += b1;\n\ +\n\ + half4 t0, t1;\n\ + _viv_asm(CONV, t0, dst0);\n\ + _viv_asm(CONV, t1, dst1);\n\ + VXC_DP2x8(dst, t0, t1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8);\n\ + _viv_asm(COPY, result, dst, 16);\n\ +\n\ + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ "; /* end of a_times_b_plus_c_vx*/ static const char add_mean_std_norm_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -3828,10 +3906,11 @@ __kernel void floordiv_BF16BF16toBF16_2D\n\ static const char gather_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int indices_num;\n\ +_viv_uniform VXC_512Bits uniExtraCopyDpKeepinEvis_2x8;\n\ \n\ __kernel void gather_I8toI8(\n\ __read_only image2d_t input0,\n\ - __read_only image2d_array_t input1,\n\ + __read_only image2d_t input1,\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ @@ -3843,7 +3922,7 @@ __kernel void gather_I8toI8(\n\ int gidz = get_global_id(2); // block_num\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ - int4 indice = read_imagei(input1, coord_in.xyyy);\n\ + int4 indice = read_imagei(input1, coord_in.xy);\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ vxc_char16 src;\n\ @@ -3855,7 +3934,7 @@ __kernel void gather_I8toI8(\n\ \n\ __kernel void gather_U8toU8(\n\ __read_only image2d_t input0,\n\ - __read_only image2d_array_t input1,\n\ + __read_only image2d_t input1,\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ @@ -3867,7 +3946,7 @@ __kernel void gather_U8toU8(\n\ int gidz = get_global_id(2); // block_num\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ - int4 indice = read_imagei(input1, coord_in.xyyy);\n\ + int4 indice = read_imagei(input1, coord_in.xy);\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ vxc_uchar16 src;\n\ @@ -3879,7 +3958,7 @@ __kernel void gather_U8toU8(\n\ \n\ __kernel void gather_I16toI16(\n\ __read_only image2d_t input0,\n\ - __read_only image2d_array_t input1,\n\ + __read_only image2d_t input1,\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ @@ -3893,7 +3972,7 @@ __kernel void gather_I16toI16(\n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ \n\ \n\ - int4 indice = read_imagei(input1, coord_in.xyyy);\n\ + int4 indice = read_imagei(input1, coord_in.xy);\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ vxc_short8 src;\n\ @@ -3905,7 +3984,7 @@ __kernel void gather_I16toI16(\n\ \n\ __kernel void gather_F16toF16(\n\ __read_only image2d_t input0,\n\ - __read_only image2d_array_t input1,\n\ + __read_only image2d_t input1,\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ @@ -3919,7 +3998,7 @@ __kernel void gather_F16toF16(\n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ \n\ \n\ - int4 indice = read_imagei(input1, coord_in.xyyy);\n\ + int4 indice = read_imagei(input1, 
coord_in.xy);\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ vxc_short8 src;\n\ @@ -3928,6 +4007,110 @@ __kernel void gather_F16toF16(\n\ int2 coord = (int2)(gidx, gidz * indices_num + gidy);\n\ VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ }\n\ +\n\ +__kernel void gather_I8toI8_axis0(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 indices = read_imagei(input1, coord.xx);\n\ + int2 coord_in = (int2)(indices.x, get_global_id(1));\n\ +\n\ + vxc_char16 src, dst;\n\ + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + indices.x = get_global_id(1);\n\ + VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniExtraCopyDpKeepinEvis_2x8);\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gather_U8toU8_axis0(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 indices = read_imagei(input1, coord.xx);\n\ + int2 coord_in = (int2)(indices.x, get_global_id(1));\n\ +\n\ + vxc_uchar16 src, dst;\n\ + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + indices.x = get_global_id(1);\n\ + VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniExtraCopyDpKeepinEvis_2x8);\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gather_I16toI16_axis0(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 indices = read_imagei(input1, coord.xx);\n\ + int2 coord_in = (int2)(indices.x, get_global_id(1));\n\ +\n\ + vxc_short8 src, dst;\n\ + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + indices.x = get_global_id(1);\n\ + VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniExtraCopyDpKeepinEvis_2x8);\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gather_F16toF16_axis0(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only 
image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 indices = read_imagei(input1, coord.xx);\n\ + int2 coord_in = (int2)(indices.x, get_global_id(1));\n\ +\n\ + vxc_short8 src, dst;\n\ + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + indices.x = get_global_id(1);\n\ + VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniExtraCopyDpKeepinEvis_2x8);\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ "; /* end of gather_vx*/ static const char gather_mix_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -3943,7 +4126,7 @@ _viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp\n\ #define GATHER_8BITS_TO_F16(src0_type_name, read_type) \\\n\ __kernel void gather_##src0_type_name##toF16( \\\n\ __read_only image2d_t input0, \\\n\ - __read_only image2d_array_t input1, \\\n\ + __read_only image2d_t input1, \\\n\ __write_only image2d_t output, \\\n\ int block_size, \\\n\ int block_num, \\\n\ @@ -3955,7 +4138,7 @@ __kernel void gather_##src0_type_name##toF16( \\\n\ int gidz = get_global_id(2); \\\n\ \\\n\ int4 coord_in = (int4)(gidy, 0, gidx, 0); \\\n\ - int4 indice = read_imagei(input1, coord_in.xyyy); \\\n\ + int4 indice = read_imagei(input1, coord_in.xy); \\\n\ coord_in.w = gidz * axis_num + indice.x; \\\n\ \\\n\ read_type src; \\\n\ @@ -3979,7 +4162,7 @@ GATHER_8BITS_TO_F16(I8, vxc_char16)\n\ #define GATHER_F16_TO_QINT(src1_type_name, write_type) \\\n\ __kernel void gather_F16to##src1_type_name( \\\n\ __read_only image2d_t input0, \\\n\ - __read_only image2d_array_t input1, \\\n\ + __read_only image2d_t input1, \\\n\ __write_only image2d_t output, \\\n\ int block_size, \\\n\ int block_num, \\\n\ @@ -3991,7 +4174,7 @@ __kernel void gather_F16to##src1_type_name( \\\n\ int gidz = get_global_id(2); \\\n\ int4 coord_in = (int4)(gidy, 0, gidx, 0); \\\n\ \\\n\ - int4 indice = read_imagei(input1, coord_in.xyyy); \\\n\ + int4 indice = read_imagei(input1, coord_in.xy); \\\n\ coord_in.w = gidz * axis_num + indice.x; \\\n\ \\\n\ vxc_short8 src; \\\n\ @@ -4011,7 +4194,7 @@ GATHER_F16_TO_QINT(I16, vxc_short8)\n\ \n\ __kernel void gather_I16toF16(\n\ __read_only image2d_t input0,\n\ - __read_only image2d_array_t input1,\n\ + __read_only image2d_t input1,\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ @@ -4023,7 +4206,7 @@ __kernel void gather_I16toF16(\n\ int gidz = get_global_id(2);\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ - int4 indice = read_imagei(input1, coord_in.xyyy);\n\ + int4 indice = read_imagei(input1, coord_in.xy);\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ vxc_short8 src;\n\ @@ -4041,6 +4224,100 @@ __kernel void gather_I16toF16(\n\ \n\ VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ }\n\ +\n\ +#define GATHER_8BITS_TO_F16_AXIS0(src0_type_name, read_type) \\\n\ +__kernel void gather_##src0_type_name##toF16_axis0( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int block_num, \\\n\ + int axis_num \\\n\ + ) \\\n\ 
+{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + int4 indices = read_imagei(input1, coord.xx); \\\n\ + int2 coord_in = (int2)(indices.x, get_global_id(1)); \\\n\ + \\\n\ + read_type src; \\\n\ + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + indices.x = get_global_id(1); \\\n\ + VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + vxc_half8 src0; \\\n\ + vxc_short8 dst0; \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + VXC_DP2x8(src0,src,ms0, VXC_MODIFIER(0,7,0, VXC_RM_TowardZero,1), uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + _viv_asm(COPY, dst0, src0, 16); \\\n\ + VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GATHER_8BITS_TO_F16_AXIS0(U8, vxc_uchar16)\n\ +GATHER_8BITS_TO_F16_AXIS0(I8, vxc_char16)\n\ +\n\ +#define GATHER_F16_TO_QINT_AXIS0(src1_type_name, write_type) \\\n\ +__kernel void gather_F16to##src1_type_name##_axis0( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int block_num, \\\n\ + int axis_num \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + int4 indices = read_imagei(input1, coord.xx); \\\n\ + int2 coord_in = (int2)(indices.x, get_global_id(1)); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + indices.x = get_global_id(1); \\\n\ + VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + vxc_ushort8 mp1; \\\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\ + vxc_half8 data; \\\n\ + write_type dst; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven, 1),uniConvertFp16toU8_2x8); \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GATHER_F16_TO_QINT_AXIS0(U8, vxc_uchar16)\n\ +GATHER_F16_TO_QINT_AXIS0(I8, vxc_char16)\n\ +GATHER_F16_TO_QINT_AXIS0(I16, vxc_short8)\n\ +\n\ +__kernel void gather_I16toF16_axis0(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 indices = read_imagei(input1, coord.xx);\n\ + int2 coord_in = (int2)(indices.x, get_global_id(1));\n\ +\n\ + vxc_short8 src;\n\ + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + indices.x = get_global_id(1);\n\ + VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_half8 src0;\n\ + vxc_short8 dst0;\n\ + vxc_ushort8 ms0;\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16);\n\ + VXC_DP2x8(src0, src, ms0, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 1),\\\n\ + uniU8MulAndPostShift_0_Lo_2x8);\n\ + _viv_asm(COPY, dst0, src0, 16);\n\ +\n\ + VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ "; /* end of gather_mix_vx*/ static const char gather_nd_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -7776,6 +8053,2059 @@ L2NORMSCALE_AXIS1_U8_2D(F16, F16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, L2NORMSCALE_AXIS1_U8_2D(F16, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)\n\ "; /* end of l2normalizescale_axis1_vx*/ +static const char layer_normalization_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +/**************************layernorm float16***********************************/\n\ +_viv_uniform int width;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4;\n\ +\n\ +__kernel void layer_norm_F16toF16(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + vxc_short8 src0, src1;\n\ + vxc_float sum = 0, sqr = 0;\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ + {\n\ + vxc_half8 val0_h;\n\ + _viv_asm(COPY, val0_h, src0, 16);\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + vxc_float4 sumsqr;\n\ + VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + sum += sumsqr.x;\n\ + sqr += sumsqr.y;\n\ + }\n\ + vxc_float mean;\n\ + mean = sum * dimRatio;\n\ + vxc_float vari;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ + vxc_float4 bias_f;\n\ + for(coord.x = 0; coord.x < width; coord.x += 4)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord.xw);\n\ + vxc_half8 in_h, scale_h;\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + vxc_float4 in_f, scale_f;\n\ + VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + vxc_float4 sub, norm;\n\ + sub = in_f - mean;\n\ + norm = scale_f * vari * sub + bias_f;\n\ + half4 norm_h;\n\ + _viv_asm(CONV, norm_h, norm);\n\ + vxc_half8 dst;\n\ + VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniExtractHalf4_dp4x4);\n\ + vxc_short8 dstval;\n\ + _viv_asm(COPY, dstval, dst, 16);\n\ + coord_out.x = coord.x;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dstval, \\\n\ + 
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +/*****************************layernorm uint8 to uint8****************************/\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ +_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int inputZP;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform int sumInZp;\n\ +_viv_uniform int tmpZp1;\n\ +_viv_uniform int tmpZp2;\n\ +_viv_uniform float e2InScale;\n\ +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +__kernel void layer_norm_U8toU8(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + vxc_uchar16 src0, src2;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + float sum = 0, sqr = 0;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + int tmpSum = 0, tmpSqr = 0;\n\ + vxc_int4 tmpSum1;\n\ + vxc_int4 tmpSqr1;\n\ + short zp = inputZP;\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + for(coord.x = 0; coord.x < width; coord.x += 16)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ + tmpSum += (tmpSum1.x);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ + tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);\n\ + }\n\ + sum = (tmpSum + sumInZp) * input_scale;\n\ + sqr = (tmpSqr + tmpZp2) * e2InScale;\n\ +\n\ + float mean, vari;\n\ + mean = sum * dimRatio;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ + int2 coord_bias = (int2)(0, 0);\n\ +\n\ + for(coord.x = 0; coord.x < width; coord.x += 16)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_bias.x = coord.x;\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ +\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + 
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert4thUint8SubZpToFp32_4x4);\n\ + tmpData0 *= input_scale;\n\ + tmpData1 *= input_scale;\n\ + tmpData2 *= input_scale;\n\ + tmpData3 *= input_scale;\n\ +\n\ + vxc_float4 norm;\n\ + tmpData0 -= mean;\n\ + norm = scale_f0 * vari * tmpData0 + bias_f0;\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + coord_bias.x += 4;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ +\n\ + tmpData1 -= mean;\n\ + norm = scale_f1 * vari * tmpData1 + bias_f1;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + tmpData2 -= mean;\n\ + norm = scale_f0 * vari * tmpData2 + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ +\n\ + tmpData3 -= mean;\n\ + norm = scale_f1 * vari * tmpData3 + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + coord_out.x = coord.x;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src2, \\\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +/***************************layernorm float16 to uint8**************************/\n\ +__kernel void layer_norm_F16toU8(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + vxc_short8 src0, src1;\n\ + vxc_float sum = 0, sqr = 0;\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ + {\n\ + vxc_half8 val0_h;\n\ + _viv_asm(COPY, val0_h, src0, 16);\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + vxc_float4 sumsqr;\n\ + VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + sum += sumsqr.x;\n\ + sqr += sumsqr.y;\n\ + }\n\ + vxc_float mean;\n\ + mean = sum * dimRatio;\n\ + vxc_float vari;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ + vxc_float4 bias_f;\n\ + for(coord.x = 0; coord.x < width; 
coord.x += 4)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord.xw);\n\ + vxc_half8 in_h, scale_h;\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + vxc_float4 in_f, scale_f;\n\ + VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + vxc_float4 sub, norm;\n\ + sub = in_f - mean;\n\ + norm = scale_f * vari * sub + bias_f;\n\ + norm = norm * outputScale + output_zp;\n\ + int4 output_int4;\n\ + output_int4 = convert_int4_rte(norm);\n\ + vxc_uchar8 dst;\n\ + VXC_DP2x8(dst, output_int4, output_int4, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\n\ + uniConvertInt32toUint8_2x8);\n\ + coord_out.x = coord.x;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of layer_normalization_vx*/ + +static const char layer_normalization_2d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +/**************************layernorm float16***********************************/\n\ +_viv_uniform int width;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4;\n\ +\n\ +__kernel void layer_norm_F16toF16_2D(\n\ + image2d_t input, image2d_t bias, image2d_t scale,\n\ + image2d_t output, float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ + vxc_short8 src0, src1;\n\ + vxc_float sum = 0, sqr = 0;\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ + {\n\ + vxc_half8 val0_h;\n\ + _viv_asm(COPY, val0_h, src0, 16);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + vxc_float4 sumsqr;\n\ + VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + sum += sumsqr.x;\n\ + sqr += sumsqr.y;\n\ + }\n\ + vxc_float mean;\n\ + mean = sum * dimRatio;\n\ + vxc_float vari;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ + vxc_float4 bias_f;\n\ + for(coord.x = 0; coord.x < width; coord.x += 4)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord.xw);\n\ + vxc_half8 in_h, scale_h;\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + vxc_float4 in_f, scale_f;\n\ + VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + vxc_float4 sub, norm;\n\ + sub = in_f - mean;\n\ + norm = scale_f * vari * sub + bias_f;\n\ + half4 norm_h;\n\ + _viv_asm(CONV, norm_h, norm);\n\ + vxc_half8 dst;\n\ + VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 
0),\\\n\ + uniExtractHalf4_dp4x4);\n\ + vxc_short8 dstval;\n\ + _viv_asm(COPY, dstval, dst, 16);\n\ + VXC_WriteImage(output, coord.xy, dstval, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +/*****************************layernorm uint8 to uint8****************************/\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ +_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int inputZP;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform int sumInZp;\n\ +_viv_uniform int tmpZp1;\n\ +_viv_uniform int tmpZp2;\n\ +_viv_uniform float e2InScale;\n\ +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +__kernel void layer_norm_U8toU8_2D(\n\ + image2d_t input, image2d_t bias, image2d_t scale,\n\ + image2d_t output, float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ + vxc_uchar16 src0, src2;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + float sum = 0, sqr = 0;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + int tmpSum = 0, tmpSqr = 0;\n\ + vxc_int4 tmpSum1;\n\ + vxc_int4 tmpSqr1;\n\ + short zp = inputZP;\n\ +\n\ + for(coord.x = 0; coord.x < width; coord.x += 16)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ + tmpSum += (tmpSum1.x);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ + tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);\n\ + }\n\ + sum = (tmpSum + sumInZp) * input_scale;\n\ + sqr = (tmpSqr + tmpZp2) * e2InScale;\n\ +\n\ + float mean, vari;\n\ + mean = sum * dimRatio;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ + int2 coord_bias = (int2)(0, 0);\n\ +\n\ + for(coord.x = 0; coord.x < width; coord.x += 16)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_bias.x = coord.x;\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ +\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + 
uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert4thUint8SubZpToFp32_4x4);\n\ + tmpData0 = tmpData0 * input_scale - mean;\n\ + tmpData1 = tmpData1 * input_scale - mean;\n\ + tmpData2 = tmpData2 * input_scale - mean;\n\ + tmpData3 = tmpData3 * input_scale - mean;\n\ +\n\ + vxc_float4 norm;\n\ + norm = scale_f0 * vari * tmpData0 + bias_f0;\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + coord_bias.x += 4;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ +\n\ + norm = scale_f1 * vari * tmpData1 + bias_f1;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + norm = scale_f0 * vari * tmpData2 + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ +\n\ + norm = scale_f1 * vari * tmpData3 + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +/***************************layernorm float16 to uint8**************************/\n\ +__kernel void layer_norm_F16toU8_2D(\n\ + image2d_t input, image2d_t bias, image2d_t scale,\n\ + image2d_t output, float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ + vxc_short8 src0, src1;\n\ + vxc_float sum = 0, sqr = 0;\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ + {\n\ + vxc_half8 val0_h;\n\ + _viv_asm(COPY, val0_h, src0, 16);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + vxc_float4 sumsqr;\n\ + VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + sum += sumsqr.x;\n\ + sqr += sumsqr.y;\n\ + }\n\ + vxc_float mean;\n\ + mean = sum * dimRatio;\n\ + vxc_float vari;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ + vxc_float4 bias_f;\n\ + for(coord.x = 0; coord.x < width; coord.x += 4)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord.xw);\n\ + vxc_half8 in_h, scale_h;\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + vxc_float4 in_f, scale_f;\n\ + VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + vxc_float4 sub, norm;\n\ + sub = in_f - mean;\n\ + norm = scale_f * vari * sub + bias_f;\n\ + norm = norm * outputScale + output_zp;\n\ + int4 output_int4;\n\ + output_int4 = convert_int4_rte(norm);\n\ + vxc_uchar8 
dst;\n\ + VXC_DP2x8(dst, output_int4, output_int4, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +"; /* end of layer_normalization_2d_vx*/ + +static const char layer_normalization_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +/**************************layernorm float16***********************************/\n\ +_viv_uniform int width;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform float dimRatio_scale;\n\ +_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform float e2InScale;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int inputZP;\n\ +\n\ +__kernel void layer_norm_I16toI16(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr);\n\ +\n\ + vxc_short8 src0, src1, dst;\n\ + vxc_float sum = 0, sqr = 0;\n\ + for(; coord.x < width;)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + vxc_float4 sumsqr;\n\ + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniInt16SumSqr_dp8x2);\n\ + sum += sumsqr.x;\n\ + sqr = sqr + sumsqr.y * e2InScale;\n\ + }\n\ + vxc_float mean;\n\ + mean = sum * dimRatio_scale;\n\ + vxc_float vari;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ +\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_half8 scale_h;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ +\n\ + int2 coord_bias = (int2)(0, 0);\n\ +\n\ + for(coord.x = 0; coord.x < width; coord.x += 8)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_bias.x = coord.x;\n\ + VXC_ReadImage(src1, scale, coord_bias, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ +\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ +\n\ + vxc_float4 sub, 
norm;\n\ + sub = tmpData0 * input_scale - mean;\n\ + norm = scale_f0 * vari * sub + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ + sub = tmpData1 * input_scale - mean;\n\ + norm = scale_f1 * vari * sub + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ +\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel void layer_norm_I16toI16_2D(\n\ + image2d_t input, image2d_t bias, image2d_t scale,\n\ + image2d_t output, float eps)\n\ +{\n\ + int2 coord = (int2)(0, get_global_id(1));\n\ +\n\ + vxc_short8 src0, src1, dst;\n\ + vxc_float sum = 0, sqr = 0;\n\ + for(; coord.x < width;)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + vxc_float4 sumsqr;\n\ + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniInt16SumSqr_dp8x2);\n\ + sum += sumsqr.x;\n\ + sqr = sqr + sumsqr.y * e2InScale;\n\ + }\n\ + vxc_float mean, vari;\n\ + mean = sum * dimRatio_scale;\n\ + vari = sqr * dimRatio - mean * mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ +\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_half8 scale_h;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ +\n\ + int2 coord_bias = (int2)(0, 0);\n\ +\n\ + for(coord.x = 0; coord.x < width; coord.x += 8)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_bias.x = coord.x;\n\ + VXC_ReadImage(src1, scale, coord_bias, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ +\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ +\n\ + vxc_float4 sub, norm;\n\ + sub = tmpData0 * input_scale - mean;\n\ + norm = scale_f0 * vari * sub + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ + sub = tmpData1 * input_scale - mean;\n\ + norm = scale_f1 * vari * sub + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ +\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +"; /* end of layer_normalization_i16_vx*/ + +static const char layer_normalization_u8_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +/*****************************layernorm uint8 to fp16****************************/\n\ +_viv_uniform int width;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits 
uniConvert2ndUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ +_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int inputZP;\n\ +_viv_uniform int sumInZp;\n\ +_viv_uniform int tmpZp1;\n\ +_viv_uniform int tmpZp2;\n\ +_viv_uniform float e2InScale;\n\ +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ +_viv_uniform VXC_512Bits UniPackFP16even_2x8;\n\ +\n\ +__kernel void layer_norm_U8toF16(\n\ + image2d_array_t input,\n\ + image2d_t bias,\n\ + image2d_t scale,\n\ + image2d_array_t output,\n\ + float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ + vxc_uchar16 src0;\n\ + float sum = 0, sqr = 0;\n\ + int tmpSum = 0, tmpSqr = 0;\n\ + vxc_int4 tmpSum1;\n\ + vxc_int4 tmpSqr1;\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + for(coord.x = 0; coord.x < width; coord.x += 16)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ + tmpSum += (tmpSum1.x);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ + tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);\n\ + }\n\ + sum = (tmpSum + sumInZp) * input_scale;\n\ + sqr = (tmpSqr + tmpZp2) * e2InScale;\n\ +\n\ + float mean, vari;\n\ + mean = sum * dimRatio;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ + int2 coord_bias = (int2)(0, 0);\n\ + vxc_half8 scale_h;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_short8 src1, outval;\n\ + short zp = inputZP;\n\ + half4 tmpVal0, tmpVal1;\n\ + vxc_half8 dst;\n\ +\n\ + for(coord.x = 0; coord.x < width; coord.x += 16)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_bias.x = coord.x;\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ +\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 
3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert4thUint8SubZpToFp32_4x4);\n\ + tmpData0 *= input_scale;\n\ + tmpData1 *= input_scale;\n\ + tmpData2 *= input_scale;\n\ + tmpData3 *= input_scale;\n\ +\n\ + vxc_float4 norm;\n\ + tmpData0 -= mean;\n\ + norm = scale_f0 * vari * tmpData0 + bias_f0;\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + coord_bias.x += 4;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ +\n\ + tmpData1 -= mean;\n\ + norm = scale_f1 * vari * tmpData1 + bias_f1;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + UniPackFP16even_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + coord_out.x = coord.x;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + tmpData2 -= mean;\n\ + norm = scale_f0 * vari * tmpData2 + bias_f0;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ +\n\ + tmpData3 -= mean;\n\ + norm = scale_f1 * vari * tmpData3 + bias_f1;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + UniPackFP16even_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + coord_out.x += 8;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel void layer_norm_U8toF16_2D(\n\ + image2d_t input,\n\ + image2d_t bias,\n\ + image2d_t scale,\n\ + image2d_t output,\n\ + float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ + vxc_uchar16 src0;\n\ + float sum = 0, sqr = 0;\n\ + int tmpSum = 0, tmpSqr = 0;\n\ + vxc_int4 tmpSum1;\n\ + vxc_int4 tmpSqr1;\n\ +\n\ + for(coord.x = 0; coord.x < width; coord.x += 16)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ + tmpSum += (tmpSum1.x);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ + tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);\n\ + }\n\ + sum = (tmpSum + sumInZp) * input_scale;\n\ + sqr = (tmpSqr + tmpZp2) * e2InScale;\n\ +\n\ + float mean, vari;\n\ + mean = sum * dimRatio;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ + int2 coord_bias = (int2)(0, 0);\n\ + vxc_half8 scale_h;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_short8 src1, outval;\n\ + short zp = inputZP;\n\ + half4 tmpVal0, tmpVal1;\n\ + vxc_half8 dst;\n\ +\n\ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + for(coord.x = 0; coord.x < width; coord.x += 16)\n\ + {\n\ + coord_bias.x = coord.x;\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ +\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert4thUint8SubZpToFp32_4x4);\n\ + tmpData0 *= input_scale;\n\ + tmpData1 *= input_scale;\n\ + tmpData2 *= input_scale;\n\ + tmpData3 *= input_scale;\n\ +\n\ + vxc_float4 norm;\n\ + tmpData0 -= mean;\n\ + norm = scale_f0 * vari * tmpData0 + bias_f0;\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + coord_bias.x += 4;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ +\n\ + tmpData1 -= mean;\n\ + norm = scale_f1 * vari * tmpData1 + bias_f1;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + UniPackFP16even_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + coord_out.x = coord.x;\n\ + VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + tmpData2 -= mean;\n\ + norm = scale_f0 * vari * tmpData2 + bias_f0;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ +\n\ + tmpData3 -= mean;\n\ + norm = scale_f1 * vari * tmpData3 + bias_f1;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + UniPackFP16even_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + coord_out.x += 8;\n\ + VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +"; /* end of layer_normalization_u8_f16_vx*/ + +static const char layer_normalization_wh_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ +_viv_uniform int width;\n\ +\n\ +_viv_uniform int height;\n\ +\n\ +_viv_uniform int height_depth;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform int group_num;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_F16toF32(\n\ + image2d_array_t input, image2d_t output)\n\ +{\n\ + int gidx = get_global_id(0) << 3;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + vxc_float4 sumsqr;\n\ + vxc_float4 tmpSumSqr = 
(vxc_float4)(0);\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr_a);\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + tmpSumSqr += sumsqr;\n\ + }\n\ + }\n\ +\n\ + lcl_sum[lidx] = tmpSumSqr.x;\n\ + lcl_sqr[lidx] = tmpSumSqr.y;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(get_group_id(0) << 2, gidz);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + float sum = 0;\n\ + float sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_F16toF32_2D(\n\ + image2d_array_t input, image2d_t output)\n\ +{\n\ + int gidx = get_global_id(0) << 3;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int gidy = gidz * height;\n\ +\n\ + int2 coord = (int2)(gidx, gidy);\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + vxc_float4 sumsqr;\n\ + vxc_float4 tmpSumSqr = (vxc_float4)(0);\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int endH = gidy + height;\n\ + if(gidx < width)\n\ + {\n\ + for(; coord.y < endH;)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + tmpSumSqr += sumsqr;\n\ + }\n\ + }\n\ +\n\ + lcl_sum[lidx] = tmpSumSqr.x;\n\ + lcl_sqr[lidx] = tmpSumSqr.y;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(get_group_id(0) << 2, gidz);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + float sum = 0;\n\ + float sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toF16(\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int2 coord_sum = (int2)(0, gidz);\n\ + int4 coord_para = coord;\n\ + coord_para.z = (ushort)gidz / (ushort)(height_depth);\n\ + vxc_short8 src0;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h, in_h;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += 
read_imagef(meanVari, coord_sum);\n\ + coord_sum.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + int4 coord_bias = coord_para;\n\ +\n\ + int8 input_desc, scale_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));\n\ + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;\n\ + _viv_asm(MOV, coord_para.w, baseAddr_c);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ +\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_short8 outval;\n\ + half4 tmpVal0, tmpVal1;\n\ + vxc_half8 dst;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_para.y = coord.y;\n\ + coord_bias.y = coord.y;\n\ + VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ +\n\ + vxc_float4 sub, norm;\n\ + sub = tmpData0 - mean_vari.s0;\n\ + norm = scale_f0 * mean_vari.s1 * sub + bias_f0;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + sub = tmpData1 - mean_vari.s0;\n\ + norm = scale_f1 * mean_vari.s1 * sub + bias_f1;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toF16_2D(\n\ + image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_t output, float eps)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ + int2 coord_bias = (int2)(0, 0);\n\ + vxc_short8 src0;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h, in_h;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_bias);\n\ + coord_bias.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + coord_bias = coord;\n\ +\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_short8 outval;\n\ + half4 tmpVal0, tmpVal1;\n\ + vxc_half8 dst;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + 
VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_bias.y = coord.y;\n\ + VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ +\n\ + vxc_float4 sub, norm;\n\ + sub = tmpData0 - mean_vari.s0;\n\ + norm = scale_f0 * mean_vari.s1 * sub + bias_f0;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + sub = tmpData1 - mean_vari.s0;\n\ + norm = scale_f1 * mean_vari.s1 * sub + bias_f1;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toU8(\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int2 coord_sum = (int2)(0, gidz);\n\ + int4 coord_para = coord;\n\ + coord_para.z = (ushort)gidz / (ushort)(height_depth);\n\ + vxc_short8 src0;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h, in_h;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_sum);\n\ + coord_sum.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + int4 coord_bias = coord_para;\n\ +\n\ + int8 input_desc, scale_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));\n\ + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;\n\ + _viv_asm(MOV, coord_para.w, baseAddr_c);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ +\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_uchar16 outval;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_para.y = coord.y;\n\ + coord_bias.y = coord.y;\n\ + VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + 
coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ +\n\ + vxc_float4 sub, norm;\n\ + sub = tmpData0 - mean_vari.s0;\n\ + norm = scale_f0 * mean_vari.s1 * sub + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ + sub = tmpData1 - mean_vari.s0;\n\ + norm = scale_f1 * mean_vari.s1 * sub + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toU8_2D(\n\ + image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_t output, float eps)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ + int2 coord_bias = (int2)(0, 0);\n\ + vxc_short8 src0;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h, in_h;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_bias);\n\ + coord_bias.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + coord_bias = coord;\n\ +\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_uchar16 outval;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_bias.y = coord.y;\n\ + VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ +\n\ + vxc_float4 sub, norm;\n\ + sub = tmpData0 - mean_vari.s0;\n\ + norm = scale_f0 * mean_vari.s1 * sub + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ + sub = tmpData1 - mean_vari.s0;\n\ + norm = scale_f1 * mean_vari.s1 * sub + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of layer_normalization_wh_f16_vx*/ + +static const char layer_normalization_wh_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2;\n\ +_viv_uniform float e2InScale;\n\ +_viv_uniform int width;\n\ +\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int height;\n\ +\n\ +_viv_uniform int height_depth;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform int group_num;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ +_viv_uniform int inputZP;\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_I16toF32(\n\ + image2d_array_t input, image2d_t output)\n\ +{\n\ + int gidx = get_global_id(0) << 4;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + vxc_short8 src0;\n\ + float4 tmpSumSqr = (float4)(0);\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr_a);\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + vxc_float4 sumsqr;\n\ + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniInt16SumSqr_dp8x2);\n\ + tmpSumSqr += sumsqr;\n\ + }\n\ + tmpSumSqr.x *= input_scale;\n\ + tmpSumSqr.y *= e2InScale;\n\ + }\n\ + lcl_sum[lidx] = tmpSumSqr.x;\n\ + lcl_sqr[lidx] = tmpSumSqr.y;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(get_group_id(0) << 2, gidz);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ + float4 data = (float4)(0);\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + data.x += dot(tmp_sum[i], one);\n\ + data.y += dot(tmp_sqr[i], one);\n\ + }\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_I16toF32_2D(\n\ + image2d_t input, image2d_t output)\n\ +{\n\ + int gidx = get_global_id(0) << 4;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int gidy = gidz * height;\n\ +\n\ + int2 coord = (int2)(gidx, gidy);\n\ + vxc_short8 src0;\n\ + float4 tmpSumSqr = (float4)(0);\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int endH = gidy + height;\n\ + if(gidx < width)\n\ + {\n\ + for(; coord.y < endH;)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + vxc_float4 sumsqr;\n\ + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniInt16SumSqr_dp8x2);\n\ + tmpSumSqr += sumsqr;\n\ + }\n\ + tmpSumSqr.x 
*= input_scale;\n\ + tmpSumSqr.y *= e2InScale;\n\ + }\n\ + lcl_sum[lidx] = tmpSumSqr.x;\n\ + lcl_sqr[lidx] = tmpSumSqr.y;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(get_group_id(0) << 2, gidz);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ + float4 data = (float4)(0);\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + data.x += dot(tmp_sum[i], one);\n\ + data.y += dot(tmp_sqr[i], one);\n\ + }\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16toI16(\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int2 coord_sum = (int2)(0, gidz);\n\ + int4 coord_para = coord;\n\ + coord_para.z = (ushort)gidz / (ushort)(height_depth);\n\ + vxc_short8 src0, src1, outval;\n\ + vxc_half8 scale_h;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_sum);\n\ + coord_sum.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + int4 coord_bias = coord_para;\n\ +\n\ + int8 input_desc, scale_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));\n\ + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;\n\ + _viv_asm(MOV, coord_para.w, baseAddr_c);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ +\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1, norm;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_para.y = coord.y;\n\ + coord_bias.y = coord.y;\n\ + VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ + tmpData0 = tmpData0 * input_scale - mean_vari.s0;\n\ + tmpData1 = tmpData1 * input_scale - mean_vari.s0;\n\ +\n\ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ + norm = scale_f1 * mean_vari.s1 * 
tmpData1 + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ +\n\ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16toI16_2D(\n\ + image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_t output, float eps)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ + int2 coord_bias = (int2)(0, 0);\n\ + vxc_short8 src0, src1, outval;\n\ + vxc_half8 scale_h;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_bias);\n\ + coord_bias.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + coord_bias = coord;\n\ +\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1, norm;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_bias.y = coord.y;\n\ + VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ + tmpData0 = tmpData0 * input_scale - mean_vari.s0;\n\ + tmpData1 = tmpData1 * input_scale - mean_vari.s0;\n\ +\n\ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ +\n\ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of layer_normalization_wh_i16_vx*/ + +static const char layer_normalization_wh_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ +_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ +_viv_uniform int sumInZp;\n\ +_viv_uniform int tmpZp1;\n\ +_viv_uniform float e2InScale;\n\ +_viv_uniform float rowSumScale;\n\ +_viv_uniform int width;\n\ +\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int height;\n\ +\n\ +_viv_uniform int height_depth;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform int group_num;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ +_viv_uniform VXC_512Bits 
uniConvertInt32toUint8_2x8;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ +_viv_uniform int inputZP;\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_U8toF32(\n\ + image2d_array_t input, image2d_t output)\n\ +{\n\ + int gidx = get_global_id(0) << 4;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + vxc_uchar16 src0;\n\ + float sum = 0, sqr = 0;\n\ + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr_a);\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ + tmpSum += (tmpSum1);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ + tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1);\n\ + }\n\ + sqr += (tmpSqr * e2InScale + rowSumScale);\n\ + sum = (tmpSum + sumInZp) * input_scale;\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(get_group_id(0) << 2, gidz);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_U8toF32_2D(\n\ + image2d_t input, image2d_t output)\n\ +{\n\ + int gidx = get_global_id(0) << 4;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int gidy = gidz * height;\n\ +\n\ + int2 coord = (int2)(gidx, gidy);\n\ + vxc_uchar16 src0;\n\ + float sum = 0, sqr = 0;\n\ + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int endH = gidy + height;\n\ + if(gidx < width)\n\ + {\n\ + for(; coord.y < endH;)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ + tmpSum += (tmpSum1);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ + tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1);\n\ + }\n\ + sqr += (tmpSqr * e2InScale + rowSumScale);\n\ + sum = (tmpSum + sumInZp) * input_scale;\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(get_group_id(0) << 2, gidz);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ + sum = 0; 
sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF16(\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int2 coord_sum = (int2)(0, gidz);\n\ + int4 coord_para = coord;\n\ + coord_para.z = (ushort)gidz / (ushort)(height_depth);\n\ + vxc_uchar16 src0;\n\ + vxc_short8 src1, outval;\n\ + vxc_half8 scale_h, dst;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_sum);\n\ + coord_sum.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + int4 coord_bias = coord_para;\n\ +\n\ + int8 input_desc, scale_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));\n\ + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;\n\ + _viv_asm(MOV, coord_para.w, baseAddr_c);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ +\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1, norm;\n\ + half4 tmpVal0, tmpVal1;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_para.y = coord.y; coord_bias.y = coord.y;\n\ + VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ + tmpData0 = tmpData0 * input_scale - mean_vari.s0;\n\ + tmpData1 = tmpData1 * input_scale - mean_vari.s0;\n\ +\n\ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ +\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 
1, 1))) void layernorm_wh_U8toF16_2D(\n\ + image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_t output, float eps)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ + int2 coord_bias = (int2)(0, 0);\n\ + vxc_uchar16 src0;\n\ + vxc_short8 src1, outval;\n\ + vxc_half8 scale_h, dst;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_bias);\n\ + coord_bias.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + coord_bias = coord;\n\ +\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1, norm;\n\ + half4 tmpVal0, tmpVal1;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_bias.y = coord.y;\n\ + VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ + tmpData0 = tmpData0 * input_scale - mean_vari.s0;\n\ + tmpData1 = tmpData1 * input_scale - mean_vari.s0;\n\ +\n\ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ +\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU8(\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int2 coord_sum = (int2)(0, gidz);\n\ + int4 coord_para = coord;\n\ + coord_para.z = (ushort)gidz / (ushort)(height_depth);\n\ + vxc_uchar16 src0 , outval;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_sum);\n\ + coord_sum.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + int4 coord_bias = coord_para;\n\ +\n\ + int8 input_desc, scale_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.w, 
baseAddr_a);\n\ +\n\ + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));\n\ + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;\n\ + _viv_asm(MOV, coord_para.w, baseAddr_c);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ +\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1, norm;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_para.y = coord.y;\n\ + coord_bias.y = coord.y;\n\ + VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ + tmpData0 = tmpData0 * input_scale - mean_vari.s0;\n\ + tmpData1 = tmpData1 * input_scale - mean_vari.s0;\n\ +\n\ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ +\n\ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU8_2D(\n\ + image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_t output, float eps)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ + int2 coord_bias = (int2)(0, 0);\n\ + vxc_uchar16 src0, outval;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_bias);\n\ + coord_bias.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + coord_bias = coord;\n\ +\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1, norm;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_bias.y = coord.y;\n\ + VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + 
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ + tmpData0 = tmpData0 * input_scale - mean_vari.s0;\n\ + tmpData1 = tmpData1 * input_scale - mean_vari.s0;\n\ +\n\ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ +\n\ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of layer_normalization_wh_u8_vx*/ + static const char log_softmax_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ _viv_uniform float rlogE;\n\ _viv_uniform int axisSize;\n\ @@ -21179,144 +23509,6 @@ __kernel void pre_process_bgra_copy_U8toU8(\n\ }\n\ "; /* end of pre_process_bgra_vx*/ -static const char pre_process_bgra_trans_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniDescaleU8_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniBilinearTmp1BgraShort_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp2BgraShort_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp3BgraShort_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp4BgraShort_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp5BgraShort_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp6BgraShort_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp7BgraShort_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp8BgraShort_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -_viv_uniform VXC_512Bits uniExtractInt32BgraToU8Bgr_2x8;\n\ -\n\ -_viv_uniform int zp;\n\ -_viv_uniform float outputScale;\n\ -\n\ -__kernel void pre_process_bgra_scale_nhwc_U8toU8(\n\ - __read_only image2d_array_t input, __write_only image2d_array_t output,\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ -{\n\ - int4 gidx = get_global_id(0);\n\ - int gidy = get_global_id(1);\n\ - gidx += (int4)(0, 1, 2, 3);\n\ -\n\ - int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14);\n\ - int4 sx = fx & 0xffff8000; // Floor\n\ - int fy, sy;\n\ - fx -= sx;\n\ - sx = sx >> 15;\n\ - fx = (fx +(1 << 4)) >> 5;\n\ -\n\ - // for y\n\ - fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14);\n\ - sy = fy & 0xffff8000; // Floor\n\ - fy -= sy;\n\ - sy = sy >> 15;\n\ -\n\ - sy = sy < 0 ? 0 : sy;\n\ - fy = fy < 0 ? 
0 : fy;\n\ -\n\ - fy = (fy + (1<< 4)) >> 5;\n\ - sx = (sx + (*xOffset)) * 4 ;\n\ - sy += (*yOffset);\n\ - int4 srcPos = (int4)(sx.x, sy, sy + 1, sx.y);\n\ - vxc_uchar16 lineBGRA0, lineBGRA1, lineBGRA2, lineBGRA3;\n\ - vxc_uchar16 dataB, dataG, dataR;\n\ -\n\ - VXC_ReadImage(lineBGRA0, input, srcPos.xy, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(lineBGRA0, input, srcPos.xz, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(lineBGRA1, input, srcPos.wy, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(lineBGRA1, input, srcPos.wz, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - srcPos.x = sx.z;\n\ - srcPos.w = sx.w;\n\ -\n\ - VXC_ReadImage(lineBGRA2, input, srcPos.xy, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(lineBGRA2, input, srcPos.xz, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(lineBGRA3, input, srcPos.wy, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(lineBGRA3, input, srcPos.wz, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - vxc_uchar4 val_u8;\n\ - int4 tmp1, tmp2, result1, result2;\n\ - float4 tmpDst, tmp0;\n\ - float4 mean = (float4)(bMean, gMean, rMean, 0);\n\ - //tmpFx = (int4)(fx.x, fx.x, fx.x, fx.x);\n\ - int tmpV = 1 << 19;\n\ - vxc_short8 tmpFx;\n\ - VXC_DP2x8(tmpFx, fx, fx, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\n\ - uniConvertInt32toUint8_2x8);\n\ - //tmpFx = fx.xxxx;\n\ - VXC_DP4x4(tmp1, lineBGRA0, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ - uniBilinearTmp1BgraShort_4x4);\n\ - VXC_DP4x4(tmp2, lineBGRA0, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ - uniBilinearTmp2BgraShort_4x4);\n\ - tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10);\n\ - VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ - uniConvertIntergetoF32_4x4);\n\ - tmpDst = (tmp0 - mean) * var;\n\ - result1 = convert_int4_rte(tmpDst * outputScale + zp);\n\ -\n\ - //tmpFx = fx.yyyy;\n\ - VXC_DP4x4(tmp1, lineBGRA1, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3BgraShort_4x4);\n\ - VXC_DP4x4(tmp2, lineBGRA1, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4BgraShort_4x4);\n\ - tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10);\n\ - VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ - uniConvertIntergetoF32_4x4);\n\ - tmpDst = (tmp0 - mean) * var;\n\ - result2 = convert_int4_rte(tmpDst * outputScale + zp);\n\ -\n\ - vxc_uchar16 dst;\n\ - VXC_DP2x8(dst, result1, result2, VXC_MODIFIER(0, 5, 0, VXC_RM_ToNearestEven, 1),\n\ - uniExtractInt32BgraToU8Bgr_2x8);\n\ -\n\ - //tmpFx = fx.zzzz;\n\ - VXC_DP4x4(tmp1, lineBGRA2, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp5BgraShort_4x4);\n\ - VXC_DP4x4(tmp2, lineBGRA2, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp6BgraShort_4x4);\n\ - tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10);\n\ - VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, 
VXC_RM_ToNearestEven, 1), \\\n\ - uniConvertIntergetoF32_4x4);\n\ - tmpDst = (tmp0 - mean) * var;\n\ - result1 = convert_int4_rte(tmpDst * outputScale + zp);\n\ -\n\ - //tmpFx = fx.wwww;\n\ - VXC_DP4x4(tmp1, lineBGRA3, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp7BgraShort_4x4);\n\ - VXC_DP4x4(tmp2, lineBGRA3, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp8BgraShort_4x4);\n\ - tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10);\n\ - VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ - uniConvertIntergetoF32_4x4);\n\ - tmpDst = (tmp0 - mean) * var;\n\ - result2 = convert_int4_rte(tmpDst * outputScale + zp);\n\ -\n\ - VXC_DP2x8(dst, result1, result2, VXC_MODIFIER(6, 11, 0, VXC_RM_ToNearestEven, 1),\n\ - uniExtractInt32BgraToU8Bgr_2x8);\n\ -\n\ - int4 dstPos = (int4)(get_global_id(0) * 3, gidy, 0, 0);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -"; /* end of pre_process_bgra_trans_vx*/ - static const char pre_process_gray_vx[] = "/*\n\ ============================================================================\n\ Name : GrayScale.vx\n\ @@ -22287,97 +24479,6 @@ __kernel void pre_process_nv12_scale_U8toF16_gq(\n\ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ }"; /* end of pre_process_nv12_scale_mix_vx*/ -static const char pre_process_nv12_trans_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform int bOrder;\n\ -_viv_uniform int rOrder;\n\ -\n\ -_viv_uniform float outputScaleVar;\n\ -_viv_uniform float bMeanScaleVarZp;\n\ -_viv_uniform float gMeanScaleVarZp;\n\ -_viv_uniform float rMeanScaleVarZp;\n\ -\n\ -_viv_uniform uint xrIntFloat_16;\n\ -_viv_uniform uint yrIntFloat_16;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertNV12toB_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertNV12toG_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertNV12toR_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -_viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniTransPackBgr1st_2x8;\n\ -_viv_uniform VXC_512Bits uniTransPackBgr2nd_2x8;\n\ -\n\ -__kernel void pre_process_nv12_trans_U8toU8(\n\ - __read_only image2d_t y_img, __read_only image2d_t uv_img,\n\ - __write_only image2d_t output,\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ -{\n\ - uint4 gidx = get_global_id(0);\n\ - uint gidy = get_global_id(1);\n\ - gidx += (uint4)(0, 1, 2, 3);\n\ -\n\ - uint dy = (gidy * yrIntFloat_16) >> 16;\n\ - uint4 dx = (gidx * xrIntFloat_16) >> 16;\n\ - int sy = convert_int(dy) + (*yOffset);\n\ - int4 sx = convert_int4(dx) + (*xOffset);\n\ - int4 uvX = sx & 0xfffffffe;\n\ - int uvY = sy >> 1;\n\ -\n\ - vxc_uchar16 Y, UV;\n\ - int2 coord = (int2)(sx.x, sy);\n\ - int2 coord_uv = (int2)(uvX.x, uvY);\n\ -\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord.x = sx.y;\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord.x = sx.z;\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord.x = sx.w;\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, 
VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_uv.x = uvX.y;\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - coord_uv.x = uvX.z;\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - coord_uv.x = uvX.w;\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - vxc_char16 tmpUV;\n\ - short tmpVal = 128;\n\ - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8);\n\ -\n\ - float4 tmpDstB, tmpDstG, tmpDstR;\n\ - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4);\n\ - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4);\n\ - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4);\n\ -\n\ - int4 result, dstR, dstG, dstB;\n\ - vxc_uchar16 dst, tmpPack;\n\ - dstB = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp);\n\ - dstG = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp);\n\ - dstR = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp);\n\ -\n\ - if(bOrder == 2)\n\ - {\n\ - int4 exchangeData = dstB;\n\ - dstB = dstR;\n\ - dstR = exchangeData;\n\ - }\n\ -\n\ - VXC_DP2x8(tmpPack, dstB, dstG, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr1st_2x8);\n\ - VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr2nd_2x8);\n\ -\n\ - int2 dstPos = (int2)(get_global_id(0) * 3, gidy);\n\ - VXC_WriteImage(output, dstPos, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -"; /* end of pre_process_nv12_trans_u8_vx*/ - static const char pre_process_rgb_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniVecShift10;\n\ @@ -22711,276 +24812,6 @@ IMAGE_PRE_PROCESS_COPY_8BITS(U8, vxc_uchar16)\n\ IMAGE_PRE_PROCESS_COPY_8BITS(I8, vxc_char16)\n\ "; /* end of pre_process_rgb_copy_vx*/ -static const char pre_process_rgb_copy_trans_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform float outputScale;\n\ -_viv_uniform float outputZP;\n\ -_viv_uniform VXC_512Bits uniNormilizationLo_2x8;\n\ -_viv_uniform VXC_512Bits uniNormilizationHi_2x8;\n\ -#define IMAGE_PRE_PROCESS_COPY_16BITS_NHWC(dst_name, dst_type, copy_type) \\\n\ -__kernel void pre_process_rgb_copy_nhwc_U8to##dst_name \\\n\ - ( \\\n\ - __read_only image2d_array_t input, \\\n\ - __write_only image2d_array_t output, \\\n\ - global int *xRatio, \\\n\ - global int *yRatio, \\\n\ - global int *xOffset, \\\n\ - global int *yOffset, \\\n\ - float rMean, \\\n\ - float gMean, \\\n\ - float bMean, \\\n\ - float f32Var, \\\n\ - int reverse_channel, \\\n\ - int trans \\\n\ - ) \\\n\ -{ \\\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ - \\\n\ - coord.xy += (int2) (*xOffset, *yOffset); \\\n\ - vxc_uchar16 src0, src1; \\\n\ - dst_type dst0, dst1; \\\n\ - copy_type dst; \\\n\ - \\\n\ - VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ - \\\n\ - f32Var *= outputScale; \\\n\ - float4 paramData = (float4)(rMean * f32Var - outputZP, gMean * f32Var - outputZP, \\\n\ - bMean * f32Var - outputZP, f32Var); \\\n\ - half4 
paramData_f16; \\\n\ - _viv_asm(CONV, paramData_f16, paramData); \\\n\ - \\\n\ - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(0)); \\\n\ - coord_out.z = coord_out.x + 8; \\\n\ - \\\n\ - VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ - uniNormilizationLo_2x8); \\\n\ - VXC_DP2x8(dst1, src0, paramData_f16, VXC_MODIFIER(0, 6, 0, VXC_RM_ToNearestEven, 1), \\\n\ - uniNormilizationHi_2x8); \\\n\ - _viv_asm(COPY, dst, dst0, 16); \\\n\ - VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - _viv_asm(COPY, dst, dst1, 16); \\\n\ - VXC_WriteImage(output, coord_out.zy, dst, VXC_MODIFIER(0, 6, 0, VXC_RM_TowardZero, 0)); \\\n\ -}\n\ -IMAGE_PRE_PROCESS_COPY_16BITS_NHWC(I16, vxc_short8, vxc_short8)\n\ -IMAGE_PRE_PROCESS_COPY_16BITS_NHWC(F16, vxc_half8, vxc_short8)\n\ -\n\ -#define IMAGE_PRE_PROCESS_COPY_8BITS_NHWC(dst_name, dst_type) \\\n\ -__kernel void pre_process_rgb_copy_nhwc_U8to##dst_name \\\n\ - ( \\\n\ - __read_only image2d_array_t input, \\\n\ - __write_only image2d_array_t output, \\\n\ - global int *xRatio, \\\n\ - global int *yRatio, \\\n\ - global int *xOffset, \\\n\ - global int *yOffset, \\\n\ - float rMean, \\\n\ - float gMean, \\\n\ - float bMean, \\\n\ - float f32Var, \\\n\ - int reverse_channel, \\\n\ - int trans \\\n\ - ) \\\n\ -{ \\\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ - coord.xy += (int2) (*xOffset, *yOffset); \\\n\ - vxc_uchar16 src0, src1; \\\n\ - dst_type dst; \\\n\ - \\\n\ - VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ - \\\n\ - f32Var *= outputScale; \\\n\ - float4 paramData = (float4)(rMean * f32Var - outputZP, gMean * f32Var - outputZP, \\\n\ - bMean * f32Var - outputZP, f32Var); \\\n\ - \\\n\ - half4 paramData_f16; \\\n\ - _viv_asm(CONV, paramData_f16, paramData); \\\n\ - \\\n\ - int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); \\\n\ - \\\n\ - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ - uniNormilizationLo_2x8); \\\n\ - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(8, 14, 0, VXC_RM_ToNearestEven, 1), \\\n\ - uniNormilizationHi_2x8); \\\n\ - VXC_WriteImage(output, coord_out, dst, VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0)); \\\n\ -}\n\ -IMAGE_PRE_PROCESS_COPY_8BITS_NHWC(U8, vxc_uchar16)\n\ -IMAGE_PRE_PROCESS_COPY_8BITS_NHWC(I8, vxc_char16)\n\ -"; /* end of pre_process_rgb_copy_trans_vx*/ - -static const char pre_process_rgb_trans_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniVecShift10;\n\ -_viv_uniform VXC_512Bits uniAddRShift;\n\ -_viv_uniform VXC_512Bits uniGetTempVal;\n\ -_viv_uniform VXC_512Bits uniExtractBytes;\n\ -_viv_uniform VXC_512Bits uniUnpackToR;\n\ -_viv_uniform VXC_512Bits uniUnpackToG;\n\ -_viv_uniform VXC_512Bits uniUnpackToB;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\ -_viv_uniform float outputScale;\n\ -_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ -_viv_uniform float outputZP;\n\ -\n\ -_viv_uniform VXC_512Bits uniRePackRGBLo_2x8;\n\ -_viv_uniform VXC_512Bits uniRePackRGBHi_2x8;\n\ -#define IMAGE_PRE_PROCESS_NHWC(dst_name, conv_type, dst_type, copy_type) \\\n\ -__kernel void pre_process_rgb_scale_nhwc_U8to##dst_name \\\n\ - ( \\\n\ -__read_only image2d_array_t input, \\\n\ -__write_only image2d_array_t output, \\\n\ - global int *xRatio, \\\n\ - global int *yRatio, \\\n\ - global int *xOffset, \\\n\ - global int *yOffset, \\\n\ - float 
rMean, \\\n\ - float gMean, \\\n\ - float bMean, \\\n\ - float f32Var, \\\n\ - int reverse_channel, \\\n\ - int trans \\\n\ - ) \\\n\ -{ \\\n\ - int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\ - int4 xPos = get_global_id(0); \\\n\ - int yPos = get_global_id(1); \\\n\ - int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \\\n\ - xPos += (int4)(0, 1, 2, 3); \\\n\ - \\\n\ - /*x*/ \\\n\ - int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \\\n\ - int4 sx = fx0 & 0xffff8000; \\\n\ - fx0 -= sx; \\\n\ - sx = sx >> 15; \\\n\ - \\\n\ - vxc_short4 fx; \\\n\ - VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); \\\n\ - /*y*/ \\\n\ - int fy = yPos * ratioXY.y + ratioSufXY.y; \\\n\ - int sy = fy & 0xffff8000; \\\n\ - \\\n\ - fy -= sy; \\\n\ - sy = sy >> 15; \\\n\ - \\\n\ - fy = (fy + (1<< 4)) >> 5; \\\n\ - \\\n\ - vxc_uchar16 line0RGB1, line0RGB2; \\\n\ - vxc_uchar16 line1RGB3, line1RGB4; \\\n\ - int4 coord; \\\n\ - sx = sx * 3 + *xOffset; \\\n\ - coord.xyz = sx.xyz; \\\n\ - coord.w = sy + *yOffset; \\\n\ - int2 coord1 = (int2)(sx.w, coord.w); \\\n\ - VXC_ReadImage(line0RGB1, input, coord.xw, 0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(line0RGB1, input, coord.yw, 0, VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(line0RGB2, input, coord.zw, 0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(line0RGB2, input, coord1, 0, VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ - \\\n\ - VXC_ReadImage(line1RGB3, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), \\\n\ - VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(line1RGB3, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), \\\n\ - VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(line1RGB4, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), \\\n\ - VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(line1RGB4, input, coord1, VXC_5BITOFFSET_XY(0, 1), \\\n\ - VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ - \\\n\ - float4 bgrMean = (float4)(bMean, gMean, rMean, 0); \\\n\ - \\\n\ - bgrMean *= f32Var; \\\n\ - \\\n\ - int4 test01, temp1; \\\n\ - int4 test02, temp2; \\\n\ - int4 tt; \\\n\ - vxc_uchar4 val; \\\n\ - int4 coord_out = (int4)(xPos.x * 3, yPos, xPos.x * 3 + 6, 0); \\\n\ - \\\n\ - vxc_uchar8 line1, line2; \\\n\ - \\\n\ - /*R*/ \\\n\ - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); \\\n\ - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); \\\n\ - \\\n\ - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ - temp1 = temp1 + test01; \\\n\ - \\\n\ - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ - temp2 = temp2 + test02; \\\n\ - temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ - \\\n\ - vxc_float4 tmp_dst; \\\n\ - vxc_uchar4 u8_dst; \\\n\ - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \\\n\ - VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ - uniConvertIntergetoF32_4x4); \\\n\ - \\\n\ - /*convert U8 to dst*/ \\\n\ - dst_type dstRG, dstB, dst; \\\n\ - tmp_dst = tmp_dst * f32Var - bgrMean.zzzz; \\\n\ - tmp_dst = tmp_dst * outputScale + outputZP; \\\n\ - 
conv_type dst0; \\\n\ - _viv_asm(CONV_RTE, dst0, tmp_dst); \\\n\ - VXC_DP2x8(dstRG, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ - \\\n\ - /*G*/ \\\n\ - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); \\\n\ - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); \\\n\ - \\\n\ - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ - temp1 = temp1 + test01; \\\n\ - \\\n\ - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ - temp2 = temp2 + test02; \\\n\ - temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ - \\\n\ - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \\\n\ - VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ - uniConvertIntergetoF32_4x4); \\\n\ - \\\n\ - tmp_dst = tmp_dst * f32Var - bgrMean.y; \\\n\ - tmp_dst = tmp_dst * outputScale + outputZP; \\\n\ - _viv_asm(CONV_RTE, dst0, tmp_dst); \\\n\ - VXC_DP2x8(dstRG, dst0, dst0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ - \\\n\ - /*B*/ \\\n\ - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); \\\n\ - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); \\\n\ - \\\n\ - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ - temp1 = temp1 + test01; \\\n\ - \\\n\ - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ - temp2 = temp2 + test02; \\\n\ - temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ - \\\n\ - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \\\n\ - VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ - uniConvertIntergetoF32_4x4); \\\n\ - \\\n\ - tmp_dst = tmp_dst * f32Var - bgrMean.x; \\\n\ - tmp_dst = tmp_dst * outputScale + outputZP; \\\n\ - _viv_asm(CONV_RTE, dst0, tmp_dst); \\\n\ - VXC_DP2x8(dstB, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ - VXC_DP2x8(dst, dstRG, dstB, VXC_MODIFIER(0, 5, 0, VXC_RM_ToNearestEven, 1), uniRePackRGBLo_2x8); \\\n\ - copy_type result; \\\n\ - _viv_asm(COPY, result, dst, 16); \\\n\ - VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_DP2x8(dst, dstRG, dstB, VXC_MODIFIER(0, 5, 0, VXC_RM_ToNearestEven, 1), uniRePackRGBHi_2x8); \\\n\ - _viv_asm(COPY, result, dst, 16); \\\n\ - VXC_WriteImage(output, coord_out.zy, result, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ -}\n\ -IMAGE_PRE_PROCESS_NHWC(U8, uint4, vxc_uchar16, vxc_uchar16)\n\ -IMAGE_PRE_PROCESS_NHWC(I8, int4, vxc_char16, vxc_char16)\n\ -IMAGE_PRE_PROCESS_NHWC(I16, int4, vxc_short8, vxc_short8)\n\ -IMAGE_PRE_PROCESS_NHWC(F16, half4, vxc_half8, vxc_short8)\n\ -"; /* end of pre_process_rgb_trans_vx*/ - static const char 
pre_process_yuv420_copy_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniCalculateTmpR1st_4x4;\n\ @@ -23006,19 +24837,6 @@ _viv_uniform VXC_512Bits uniCalculateTmpB3rd_4x4;\n\ _viv_uniform VXC_512Bits uniCalculateTmpB4th_4x4;\n\ _viv_uniform VXC_512Bits uniCalculateB1st_4x4;\n\ \n\ -_viv_uniform VXC_512Bits uniPackBG0_2x8;\n\ -_viv_uniform VXC_512Bits uniPackTmpAndR_2x8;\n\ -_viv_uniform VXC_512Bits uniPackRB0_2x8;\n\ -_viv_uniform VXC_512Bits uniPackTmp0AndG_2x8;\n\ -_viv_uniform VXC_512Bits uniPackGR1_2x8;\n\ -_viv_uniform VXC_512Bits uniPackTmp1AndB_2x8;\n\ -_viv_uniform VXC_512Bits uniPackBG1_2x8;\n\ -_viv_uniform VXC_512Bits uniPackTmp1AndR_2x8;\n\ -_viv_uniform VXC_512Bits uniPackRB2_2x8;\n\ -_viv_uniform VXC_512Bits uniPackTmp2AndG_2x8;\n\ -_viv_uniform VXC_512Bits uniPackGR2_2x8;\n\ -_viv_uniform VXC_512Bits uniPackTmp2AndB_2x8;\n\ -\n\ _viv_uniform VXC_512Bits uniQuantU8toU8LoB_2x8;\n\ _viv_uniform VXC_512Bits uniQuantU8toU8HiB_2x8;\n\ _viv_uniform VXC_512Bits uniQuantU8toU8LoG_2x8;\n\ @@ -23128,140 +24946,6 @@ __kernel void pre_process_yuv420_copy_U8toU8(\n\ VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ -// store bgrbgrbgr\n\ -__kernel void pre_process_yuv420_copy_trans_U8(\n\ - __read_only image2d_t y_img,\n\ - __read_only image2d_t u_img,\n\ - __read_only image2d_t v_img,\n\ - __write_only image2d_array_t output,\n\ - global int * xRatio,\n\ - global int * yRatio,\n\ - global int * xOffset,\n\ - global int * yOffset,\n\ - float rMean,\n\ - float gMean,\n\ - float bMean,\n\ - float var,\n\ - int reverse_channel,\n\ - int trans\n\ - )\n\ -{\n\ - int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0);\n\ - int4 pos1 = (int4)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1, 0, 0);\n\ - vxc_uchar16 Y;\n\ - vxc_uchar8 U, V;\n\ - vxc_int4 C0, C1, C2, C3;\n\ - vxc_uchar16 R, G, B;\n\ - vxc_uchar16 dst;\n\ -\n\ - VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - var *= outputScale;\n\ - float4 paramData = (float4)(bMean * var - zp, gMean * var - zp,\\\n\ - rMean * var - zp, var);\n\ - half4 paramData_f16;\n\ - _viv_asm(CONV, paramData_f16, paramData);\n\ -\n\ - //C = Y - 16;\n\ - //D = U - 128;\n\ - //E = V - 128;\n\ - // calculate R\n\ - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]\n\ - int tmpV = -56992;\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4);\n\ -\n\ - VXC_DP4x4(dst, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(dst, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(dst, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(dst, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ -\n\ - VXC_DP2x8(R, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, 
VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8);\n\ - VXC_DP2x8(R, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8);\n\ -\n\ - // calculate G\n\ - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]\n\ - // 298Y - 208V\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4);\n\ - // 34784 - 100U\n\ - ushort tmpG = 34784;\n\ - vxc_ushort8 tmpDstG;\n\ - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);\n\ - VXC_DP4x4(dst, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4);\n\ - VXC_DP4x4(dst, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4);\n\ - VXC_DP4x4(dst, C2, tmpDstG, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4);\n\ - VXC_DP4x4(dst, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4);\n\ -\n\ - VXC_DP2x8(G, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8);\n\ - VXC_DP2x8(G, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8);\n\ -\n\ - // calculate B\n\ - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]\n\ - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4);\n\ - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4);\n\ - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4);\n\ - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4);\n\ - tmpV = -70688;\n\ - VXC_DP4x4(dst, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(dst, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(dst, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(dst, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ -\n\ - VXC_DP2x8(B, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8);\n\ - VXC_DP2x8(B, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8);\n\ -\n\ - // reorder to bgr\n\ - vxc_uchar8 tmpdst0, tmpdst1;\n\ - vxc_uchar16 dst0, dst1, dst2;\n\ -\n\ - if(bOrder == 2)\n\ - {\n\ - vxc_uchar16 exchangeData = B;\n\ - B = R;\n\ - R = exchangeData;\n\ - }\n\ -\n\ - // BGR BGR BG\n\ - VXC_DP2x8(tmpdst0, B, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackBG0_2x8);\n\ - VXC_DP2x8(dst0, tmpdst0, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmpAndR_2x8);\n\ -\n\ - // RBG RBG RB\n\ - VXC_DP2x8(tmpdst0, R, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackRB0_2x8);\n\ - VXC_DP2x8(dst0, tmpdst0, G, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp0AndG_2x8);\n\ -\n\ - pos = (int4)(get_global_id(0) * 3, get_global_id(1), 0, 0);\n\ -\n\ - VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - pos.x += 16;\n\ -\n\ - // GRB GRB GR\n\ - VXC_DP2x8(tmpdst0, G, R, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0), uniPackGR1_2x8);\n\ - VXC_DP2x8(dst1, tmpdst0, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmp1AndB_2x8);\n\ -\n\ - // BGR BGR BG\n\ - VXC_DP2x8(tmpdst0, B, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackBG1_2x8);\n\ - VXC_DP2x8(dst1, tmpdst0, R, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp1AndR_2x8);\n\ -\n\ - VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - pos.x += 16;\n\ -\n\ - // RBG RBG RB\n\ - VXC_DP2x8(tmpdst0, R, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackRB2_2x8);\n\ - VXC_DP2x8(dst2, tmpdst0, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmp2AndG_2x8);\n\ -\n\ - // GRB GRB GR\n\ - VXC_DP2x8(tmpdst1, G, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackGR2_2x8);\n\ - VXC_DP2x8(dst2, tmpdst1, B, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp2AndB_2x8);\n\ -\n\ - VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ "; /* end of pre_process_yuv420_copy_u8_vx*/ static const char pre_process_yuv420_scale_fp16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -24182,242 +25866,6 @@ __kernel void pre_process_yuv420_scale_U8toU8(\n\ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ }"; /* end of pre_process_yuv420_scale_u8_vx*/ -static const char pre_process_yuv420_trans_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateR1st_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateB1st_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -_viv_uniform VXC_512Bits uniDescaleU8_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateGWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniTransPackBgr1st_2x8;\n\ -_viv_uniform VXC_512Bits uniTransPackBgr2nd_2x8;\n\ -\n\ -_viv_uniform int bOrder;\n\ -_viv_uniform int rOrder;\n\ -_viv_uniform int zp;\n\ -_viv_uniform float outputScale;\n\ -\n\ -__kernel void pre_process_yuv420_trans_U8toU8(\n\ - __read_only image2d_array_t y_img, __read_only image2d_array_t u_img,\n\ - __read_only image2d_array_t v_img, __write_only image2d_array_t output,\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ -{\n\ - int4 gidx = get_global_id(0);\n\ - int gidy = get_global_id(1);\n\ - gidx += (int4)(0, 1, 2, 3);\n\ -\n\ - int4 fx = (gidx * (*xRatio) 
+ ((*xRatio) >> 1)) - (1 << 14);\n\ - int4 sx = fx & 0xffff8000; // Floor\n\ - int fy, sy;\n\ - fx -= sx;\n\ - sx = sx >> 15;\n\ - fx = (fx +(1 << 4)) >> 5;\n\ -\n\ - // for y\n\ - fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14);\n\ - sy = fy & 0xffff8000; // Floor\n\ - fy -= sy;\n\ - sy = sy >> 15;\n\ -\n\ - sy = sy < 0 ? 0 : sy;\n\ - fy = fy < 0 ? 0 : fy;\n\ -\n\ - fy = (fy + (1<< 4)) >> 5;\n\ - sx += (*xOffset);\n\ - sy += (*yOffset);\n\ - int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0);\n\ - int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0);\n\ - int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0);\n\ -\n\ - vxc_uchar16 Y, U, V;\n\ - vxc_int4 C0, C1, C2, C3;\n\ - vxc_uchar16 R, G, B;\n\ -\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.x + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.x + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - srcPos.x = sx.y;\n\ - srcPos1.x = sx.y >> 1;\n\ - srcPos2.x = sx.y >> 1;\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.y + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.y + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - srcPos.x = sx.z;\n\ - srcPos1.x = sx.z >> 1;\n\ - srcPos2.x = sx.z >> 1;\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 
0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.z + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.z + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - srcPos.x = sx.w;\n\ - srcPos1.x = sx.w >> 1;\n\ - srcPos2.x = sx.w >> 1;\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.w + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.w + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - //C = Y - 16; D = U - 128; E = V - 128;\n\ - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]\n\ - int tmpV = -56992;\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4);\n\ - VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ -\n\ - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]\n\ - // 298Y - 208V\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniCalculateTmpGWise4th_4x4);\n\ - // 34784 - 100U\n\ - ushort tmpG = 34784;\n\ - vxc_ushort8 tmpDstG, tmpDstG1;\n\ - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);\n\ - VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8);\n\ - VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);\n\ - VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);\n\ - VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);\n\ - VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);\n\ -\n\ - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]\n\ - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4);\n\ - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4);\n\ - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4);\n\ - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4);\n\ - tmpV = -70688;\n\ - VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ -\n\ - int4 result, temp1, temp2, dstR, dstG, dstB;\n\ - int4 tmpData0, tmpData1;\n\ -\n\ - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ - temp1 = fx * tmpData0 + tmpData1;\n\ - // temp2 - temp1\n\ - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ - temp2 = fx * tmpData0 + tmpData1;\n\ - result = fy * temp2 + (temp1 << 10);\n\ -\n\ - tmpV = 1 << 19;\n\ - vxc_uchar8 dst, tmpPack;\n\ - float4 tmpDst;\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - bMean) * var;\n\ - dstB = convert_int4_rte(tmpDst * outputScale + zp);\n\ -\n\ - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ - temp1 = fx * tmpData0 + tmpData1;\n\ - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ - temp2 = fx * tmpData0 + tmpData1;\n\ - result = fy * temp2 + (temp1 << 10);\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - gMean) * var;\n\ - dstG = convert_int4_rte(tmpDst * outputScale + zp);\n\ -\n\ - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ - temp1 = fx * tmpData0 + tmpData1;\n\ - VXC_DP4x4(tmpData0, R, R, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ - temp2 = fx * tmpData0 + tmpData1;\n\ - result = fy * temp2 + (temp1 << 10);\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - rMean) * var;\n\ - dstR = convert_int4_rte(tmpDst * outputScale + zp);\n\ -\n\ - if(bOrder == 2)\n\ - {\n\ - int4 exchangeData = dstB;\n\ - dstB = dstR;\n\ - dstR = exchangeData;\n\ - }\n\ -\n\ - VXC_DP2x8(tmpPack, dstB, dstG, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr1st_2x8);\n\ - VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr2nd_2x8);\n\ -\n\ - int2 dstPos = (int2)(get_global_id(0) * 3, gidy);\n\ - VXC_WriteImage(output, dstPos, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of pre_process_yuv420_trans_u8_vx*/ - static const char pre_process_yuv444_copy_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniCalculateTmpR1st_4x4;\n\ @@ -24442,19 +25890,6 @@ _viv_uniform VXC_512Bits uniCalculateTmpB3rd_4x4;\n\ _viv_uniform VXC_512Bits uniCalculateTmpB4th_4x4;\n\ _viv_uniform VXC_512Bits uniCalculateB1st_4x4;\n\ \n\ -_viv_uniform VXC_512Bits uniPackBG0_2x8;\n\ -_viv_uniform VXC_512Bits uniPackTmpAndR_2x8;\n\ -_viv_uniform VXC_512Bits uniPackRB0_2x8;\n\ -_viv_uniform VXC_512Bits uniPackTmp0AndG_2x8;\n\ -_viv_uniform VXC_512Bits uniPackGR1_2x8;\n\ -_viv_uniform VXC_512Bits uniPackTmp1AndB_2x8;\n\ -_viv_uniform VXC_512Bits uniPackBG1_2x8;\n\ -_viv_uniform VXC_512Bits uniPackTmp1AndR_2x8;\n\ -_viv_uniform VXC_512Bits uniPackRB2_2x8;\n\ -_viv_uniform VXC_512Bits uniPackTmp2AndG_2x8;\n\ -_viv_uniform VXC_512Bits uniPackGR2_2x8;\n\ -_viv_uniform VXC_512Bits uniPackTmp2AndB_2x8;\n\ -\n\ _viv_uniform VXC_512Bits uniQuantU8toU8LoB_2x8;\n\ _viv_uniform VXC_512Bits uniQuantU8toU8HiB_2x8;\n\ _viv_uniform VXC_512Bits uniQuantU8toU8LoG_2x8;\n\ @@ -24563,140 +25998,7 @@ __kernel void pre_process_yuv444_copy_U8toU8(\n\ pos.z = rOrder;\n\ VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ }\n\ -\n\ -// store bgrbgrbgr\n\ -__kernel void pre_process_yuv444_copy_trans_U8(\n\ - __read_only image2d_t y_img,\n\ - __read_only image2d_t u_img,\n\ - __read_only image2d_t v_img,\n\ - __write_only image2d_array_t output,\n\ - global int * xRatio,\n\ - global int * yRatio,\n\ - global int * xOffset,\n\ - global int * yOffset,\n\ - float rMean,\n\ - float gMean,\n\ - float bMean,\n\ - float var,\n\ - int reverse_channel,\n\ - int trans\n\ - )\n\ -{\n\ - int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0);\n\ - vxc_uchar16 Y, U, V;\n\ - vxc_int4 C0, C1, C2, C3;\n\ - vxc_uchar16 R, G, B;\n\ - vxc_uchar16 dst;\n\ -\n\ - VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - var *= outputScale;\n\ - float4 paramData = (float4)(bMean * var - zp, gMean * var - zp,\\\n\ - rMean * var - zp, var);\n\ - half4 paramData_f16;\n\ - _viv_asm(CONV, paramData_f16, paramData);\n\ -\n\ - //C = Y - 16;\n\ - 
//D = U - 128;\n\ - //E = V - 128;\n\ - // calculate R\n\ - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]\n\ - int tmpV = -56992;\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4);\n\ -\n\ - VXC_DP4x4(dst, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(dst, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(dst, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(dst, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ -\n\ - VXC_DP2x8(R, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8);\n\ - VXC_DP2x8(R, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8);\n\ -\n\ - // calculate G\n\ - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]\n\ - // 298Y - 208V\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4);\n\ - // 34784 - 100U\n\ - ushort tmpG = 34784;\n\ - vxc_ushort8 tmpDstG0, tmpDstG1;\n\ - VXC_DP2x8(tmpDstG0, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);\n\ - VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2_2x8);\n\ - VXC_DP4x4(dst, C0, tmpDstG0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4);\n\ - VXC_DP4x4(dst, C1, tmpDstG0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4);\n\ - VXC_DP4x4(dst, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4);\n\ - VXC_DP4x4(dst, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4);\n\ -\n\ - VXC_DP2x8(G, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8);\n\ - VXC_DP2x8(G, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8);\n\ -\n\ - // calculate B\n\ - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]\n\ - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4);\n\ - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4);\n\ - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4);\n\ - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4);\n\ - tmpV = -70688;\n\ - VXC_DP4x4(dst, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(dst, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(dst, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(dst, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ 
-\n\ - VXC_DP2x8(B, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8);\n\ - VXC_DP2x8(B, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8);\n\ -\n\ - // reorder to bgr\n\ - vxc_uchar8 tmpdst0, tmpdst1;\n\ - vxc_uchar16 dst0, dst1, dst2;\n\ -\n\ - if(bOrder == 2)\n\ - {\n\ - vxc_uchar16 exchangeData = B;\n\ - B = R;\n\ - R = exchangeData;\n\ - }\n\ -\n\ - // BGR BGR BG\n\ - VXC_DP2x8(tmpdst0, B, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackBG0_2x8);\n\ - VXC_DP2x8(dst0, tmpdst0, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmpAndR_2x8);\n\ -\n\ - // RBG RBG RB\n\ - VXC_DP2x8(tmpdst0, R, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackRB0_2x8);\n\ - VXC_DP2x8(dst0, tmpdst0, G, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp0AndG_2x8);\n\ -\n\ - pos = (int4)(get_global_id(0) * 3, get_global_id(1), 0, 0);\n\ -\n\ - VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - pos.x += 16;\n\ -\n\ - // GRB GRB GR\n\ - VXC_DP2x8(tmpdst0, G, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackGR1_2x8);\n\ - VXC_DP2x8(dst1, tmpdst0, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmp1AndB_2x8);\n\ -\n\ - // BGR BGR BG\n\ - VXC_DP2x8(tmpdst0, B, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackBG1_2x8);\n\ - VXC_DP2x8(dst1, tmpdst0, R, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp1AndR_2x8);\n\ -\n\ - VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - pos.x += 16;\n\ -\n\ - // RBG RBG RB\n\ - VXC_DP2x8(tmpdst0, R, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackRB2_2x8);\n\ - VXC_DP2x8(dst2, tmpdst0, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmp2AndG_2x8);\n\ -\n\ - // GRB GRB GR\n\ - VXC_DP2x8(tmpdst1, G, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackGR2_2x8);\n\ - VXC_DP2x8(dst2, tmpdst1, B, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp2AndB_2x8);\n\ -\n\ - VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of pre_process_yuv444_copy_u8_vx*/ +"; /* end of pre_process_yuv444_copy_u8_vx*/ static const char pre_process_yuv444_scale_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -25086,203 +26388,6 @@ __kernel void pre_process_yuv444_scale_U8toF16(\n\ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ }"; /* end of pre_process_yuv444_scale_fp16_vx*/ -static const char pre_process_yuv444_trans_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateR1st_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateB1st_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -_viv_uniform VXC_512Bits uniDescaleU8_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateGWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits 
uniCalculateTmpBWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniTransPackBgr1st_2x8;\n\ -_viv_uniform VXC_512Bits uniTransPackBgr2nd_2x8;\n\ -\n\ -_viv_uniform int bOrder;\n\ -_viv_uniform int rOrder;\n\ -_viv_uniform int zp;\n\ -_viv_uniform float outputScale;\n\ -\n\ -#define IMAGE_PRE_PROCESS_YUV444_TRANS(dst_name, dst_type) \\\n\ -__kernel void pre_process_yuv444_trans_U8to##dst_name( \\\n\ - __read_only image2d_t y_img, __read_only image2d_t u_img, \\\n\ - __read_only image2d_t v_img, __write_only image2d_t output, \\\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, \\\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) \\\n\ -{ \\\n\ - int4 gidx = get_global_id(0); \\\n\ - int gidy = get_global_id(1); \\\n\ - gidx += (int4)(0, 1, 2, 3); \\\n\ - \\\n\ - int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); \\\n\ - int4 sx = fx & 0xffff8000; \\\n\ - int fy, sy; \\\n\ - fx -= sx; \\\n\ - sx = sx >> 15; \\\n\ - fx = (fx +(1 << 4)) >> 5; \\\n\ - \\\n\ - fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); \\\n\ - sy = fy & 0xffff8000; \\\n\ - fy -= sy; \\\n\ - sy = sy >> 15; \\\n\ - \\\n\ - sy = sy < 0 ? 0 : sy; \\\n\ - fy = fy < 0 ? 0 : fy; \\\n\ - \\\n\ - fy = (fy + (1<< 4)) >> 5; \\\n\ - sx += (*xOffset); \\\n\ - sy += (*yOffset); \\\n\ - int2 srcPos = (int2)(sx.x, sy); \\\n\ - \\\n\ - vxc_uchar16 Y, U, V; \\\n\ - vxc_int4 C0, C1, C2, C3; \\\n\ - vxc_uchar16 R, G, B; \\\n\ - \\\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - \\\n\ - srcPos.x = sx.y; \\\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - \\\n\ - srcPos.x = sx.z; \\\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(U, u_img, srcPos, 
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ - \\\n\ - srcPos.x = sx.w; \\\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ - \\\n\ - int tmpV = -56992; \\\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); \\\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); \\\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); \\\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); \\\n\ - VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ - VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ - VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ - VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ - \\\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); \\\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); \\\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); \\\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); \\\n\ - \\\n\ - ushort tmpG = 34784; \\\n\ - vxc_ushort8 tmpDstG, tmpDstG1; \\\n\ - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); \\\n\ - VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); \\\n\ - VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \\\n\ - VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \\\n\ - VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \\\n\ - VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \\\n\ - \\\n\ - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); \\\n\ - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); \\\n\ - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); \\\n\ - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); \\\n\ - tmpV = 
-70688; \\\n\ - VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ - VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ - VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ - VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ - \\\n\ - int4 result, temp1, temp2, dstR, dstG, dstB; \\\n\ - int4 tmpData0, tmpData1; \\\n\ - \\\n\ - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \\\n\ - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \\\n\ - temp1 = fx * tmpData0 + tmpData1; \\\n\ - \\\n\ - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \\\n\ - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \\\n\ - temp2 = fx * tmpData0 + tmpData1; \\\n\ - result = fy * temp2 + (temp1 << 10); \\\n\ - \\\n\ - tmpV = 1 << 19; \\\n\ - dst_type dst, tmpPack; \\\n\ - float4 tmpDst; \\\n\ - \\\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ - tmpDst = (tmpDst - bMean) * var; \\\n\ - dstB = convert_int4_rte(tmpDst * outputScale + zp); \\\n\ - \\\n\ - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \\\n\ - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \\\n\ - temp1 = fx * tmpData0 + tmpData1; \\\n\ - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \\\n\ - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \\\n\ - temp2 = fx * tmpData0 + tmpData1; \\\n\ - result = fy * temp2 + (temp1 << 10); \\\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ - tmpDst = (tmpDst - gMean) * var; \\\n\ - dstG = convert_int4_rte(tmpDst * outputScale + zp); \\\n\ - \\\n\ - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \\\n\ - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \\\n\ - temp1 = fx * tmpData0 + tmpData1; \\\n\ - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \\\n\ - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \\\n\ - temp2 = fx * tmpData0 + tmpData1; \\\n\ - result = fy * temp2 + (temp1 << 10); \\\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ - tmpDst = (tmpDst - rMean) * var; \\\n\ - dstR = convert_int4_rte(tmpDst * outputScale + zp); \\\n\ - \\\n\ - if(bOrder == 2) \\\n\ - { \\\n\ - int4 exchangeData = dstB; \\\n\ - dstB = dstR; \\\n\ - dstR = exchangeData; \\\n\ - } \\\n\ - \\\n\ - VXC_DP2x8(tmpPack, dstB, dstG, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); \\\n\ - VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr1st_2x8); \\\n\ - VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr2nd_2x8); \\\n\ - \\\n\ - int2 dstPos = (int2)(get_global_id(0) * 3, gidy); \\\n\ - VXC_WriteImage(output, dstPos, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ -}\n\ 
-IMAGE_PRE_PROCESS_YUV444_TRANS(U8, vxc_uchar16)"; /* end of pre_process_yuv444_trans_u8_vx*/ - static const char prelu_vx[] = "\n\ #include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -29709,37 +30814,34 @@ __kernel void resize_bilinear_BF16toBF16_DOWN\n\ float top_y_f = floor(in_y);\n\ float y_lerp = in_y - top_y_f;\n\ int top_y_idx = convert_int(top_y_f);\n\ - int bottom_y_idx = top_y_idx + 1;\n\ vxc_short8 top;\n\ vxc_short8 bottom;\n\ vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(top, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.y;\n\ - VXC_ReadImage2DArray(top, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.z;\n\ - VXC_ReadImage2DArray(top, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.w;\n\ - VXC_ReadImage2DArray(top, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord_in.y = bottom_y_idx;\n\ - coord_in.x = left_x_idx.x;\n\ - VXC_ReadImage2DArray(bottom, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.y;\n\ - VXC_ReadImage2DArray(bottom, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.z;\n\ - VXC_ReadImage2DArray(bottom, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.w;\n\ - VXC_ReadImage2DArray(bottom, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_ushort8 src;\n\ float4 left4;\n\ @@ -29765,7 +30867,14 @@ __kernel void resize_bilinear_BF16toBF16_DOWN\n\ vxc_ushort8 tmp, dst;\n\ _viv_asm(COPY, tmp, dst4, 16);\n\ dst.s0123 = tmp.s1357;\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, 
coord_out.w, baseAddr);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void resize_bilinear_BF16toBF16_UP\n\ @@ -29788,22 +30897,24 @@ __kernel void resize_bilinear_BF16toBF16_UP\n\ float top_y_f = floor(in_y);\n\ float y_lerp = in_y - top_y_f;\n\ int top_y_idx = convert_int(top_y_f);\n\ - float bottom_y_f = ceil(in_y);\n\ - int bottom_y_idx= convert_int(bottom_y_f);\n\ vxc_ushort8 src0, src1, src2, src3, dst0, dst1;\n\ vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ - coord_in.y = bottom_y_idx;\n\ - VXC_ReadImage2DArray(src2, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src3, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_ushort8 bitextract_p0;\n\ vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};\n\ @@ -29813,29 +30924,36 @@ __kernel void resize_bilinear_BF16toBF16_UP\n\ VXC_DP2x8(maskShift, bitextract_p0, constData, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);\n\ \n\ - do\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + float4 left4;\n\ + float4 right4;\n\ + float4 top4;\n\ + float4 bottom4;\n\ +\n\ + int loop = depth - 1;\n\ + while (coord_in.z < loop)\n\ {\n\ VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ - coord_in.z ++;\n\ - coord_in.y = top_y_idx;\n\ - VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ - coord_in.y = bottom_y_idx;\n\ - VXC_ReadImage2DArray(src2, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src3, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.w += input_desc.s4;\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + 
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z ++;\n\ \n\ vxc_ushort8 dst_tmp;\n\ - float4 left4;\n\ - float4 right4;\n\ - float4 top4;\n\ - float4 bottom4;\n\ +\n\ \n\ VXC_DP2x8(dst_tmp, dst0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ _viv_asm(COPY, left4, dst_tmp, 16);\n\ @@ -29857,16 +30975,39 @@ __kernel void resize_bilinear_BF16toBF16_UP\n\ vxc_ushort8 tmp, dst;\n\ _viv_asm(COPY, tmp, dst4, 16);\n\ dst.s0123 = tmp.s1357;\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.z ++;\n\ - } while (coord_in.z < depth);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.w += output_desc.s4;\n\ + }\n\ +\n\ + VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + vxc_ushort8 dst_tmp;\n\ + VXC_DP2x8(dst_tmp, dst0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, left4, dst_tmp, 16);\n\ + VXC_DP2x8(dst_tmp, dst0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, right4, dst_tmp, 16);\n\ + right4 -= left4;\n\ + top4 = right4 * x_lerp + left4;\n\ + VXC_DP2x8(dst_tmp, dst1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, left4, dst_tmp, 16);\n\ + VXC_DP2x8(dst_tmp, dst1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, right4, dst_tmp, 16);\n\ + right4 -= left4;\n\ + bottom4 = right4 * x_lerp + left4;\n\ + bottom4 -= top4;\n\ + float4 dst4 = bottom4 * y_lerp + top4;\n\ + vxc_ushort8 tmp, dst;\n\ + _viv_asm(COPY, tmp, dst4, 16);\n\ + dst.s0123 = tmp.s1357;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ "; /* end of resize_bilinear_BF16_vx*/ static const char resize_bilinear_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ -_viv_uniform VXC_512Bits uniFp16toFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_left_4x4;\n\ _viv_uniform VXC_512Bits uniRightSubLeft_4x4;\n\ _viv_uniform VXC_512Bits uniExtactHalf8_2x8;\n\ _viv_uniform float2 scale_xy;\n\ @@ -29892,94 +31033,66 @@ __kernel void resize_bilinear_F16toF16_DOWN\n\ float4 left_x_f = floor(in_x);\n\ float4 x_lerp = in_x - left_x_f;\n\ int4 left_x_idx = convert_int4(left_x_f);\n\ - float4 right_x_f = ceil(in_x);\n\ - int4 right_x_idx = convert_int4(right_x_f);\n\ float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;\n\ float top_y_f = floor(in_y);\n\ float y_lerp = in_y - top_y_f;\n\ int top_y_idx = convert_int(top_y_f);\n\ - float bottom_y_f = ceil(in_y);\n\ - int bottom_y_idx= convert_int(bottom_y_f);\n\ - vxc_short8 top_left0, top_right0;\n\ - vxc_short8 bottom_left0, bottom_right0;\n\ - vxc_half8 top_left, top_right;\n\ - vxc_half8 bottom_left, bottom_right;\n\ + vxc_short8 top_short, bottom_short, dst;\n\ + vxc_half8 top, bottom, 
result;\n\ int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(top_left0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.y;\n\ - VXC_ReadImage2DArray(top_left0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.z;\n\ - VXC_ReadImage2DArray(top_left0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.w;\n\ - VXC_ReadImage2DArray(top_left0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, top_left, top_left0, 16);\n\ -\n\ - coord_in.x = right_x_idx.x;\n\ - VXC_ReadImage2DArray(top_right0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.y;\n\ - VXC_ReadImage2DArray(top_right0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.z;\n\ - VXC_ReadImage2DArray(top_right0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.w;\n\ - VXC_ReadImage2DArray(top_right0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, top_right, top_right0, 16);\n\ -\n\ - coord_in.y = bottom_y_idx;\n\ - coord_in.x = left_x_idx.x;\n\ - VXC_ReadImage2DArray(bottom_left0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.y;\n\ - VXC_ReadImage2DArray(bottom_left0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.z;\n\ - VXC_ReadImage2DArray(bottom_left0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.w;\n\ - VXC_ReadImage2DArray(bottom_left0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, bottom_left, bottom_left0, 16);\n\ -\n\ - coord_in.x = right_x_idx.x;\n\ - VXC_ReadImage2DArray(bottom_right0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.y;\n\ - VXC_ReadImage2DArray(bottom_right0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), 
VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.z;\n\ - VXC_ReadImage2DArray(bottom_right0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.w;\n\ - VXC_ReadImage2DArray(bottom_right0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, bottom_right, bottom_right0, 16);\n\ + VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, top, top_short, 16);\n\ + _viv_asm(COPY, bottom, bottom_short, 16);\n\ \n\ float4 left4;\n\ float4 right4;\n\ float4 top4;\n\ float4 bottom4;\n\ \n\ - VXC_DP4x4(left4, top_left, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4);\n\ - VXC_DP4x4(right4, top_right, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ - top4 = right4 * x_lerp + left4;\n\ - VXC_DP4x4(left4, bottom_left, bottom_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4);\n\ - VXC_DP4x4(right4, bottom_right, bottom_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ - bottom4 = right4 * x_lerp + left4;\n\ - bottom4 -= top4;\n\ - float4 dst4 = bottom4 * y_lerp + top4;\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + top4 = right4 * x_lerp + left4;\n\ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + bottom4 = right4 * x_lerp + left4;\n\ + bottom4 -= top4;\n\ + float4 dst4 = bottom4 * y_lerp + top4;\n\ +\n\ half4 tmp;\n\ _viv_asm(CONV, tmp, dst4);\n\ - VXC_DP2x8(top_left, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);\n\ - _viv_asm(COPY, top_left0, top_left, 16);\n\ - VXC_WriteImage2DArray(output, coord_out, top_left0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(result, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);\n\ + _viv_asm(COPY, dst, result, 16);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void resize_bilinear_F16toU8_DOWN\n\ @@ -29996,84 +31109,50 @@ __kernel void resize_bilinear_F16toU8_DOWN\n\ float4 left_x_f = floor(in_x);\n\ float4 x_lerp = in_x - left_x_f;\n\ int4 left_x_idx = convert_int4(left_x_f);\n\ - float4 right_x_f = ceil(in_x);\n\ - int4 right_x_idx = convert_int4(right_x_f);\n\ float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;\n\ float top_y_f = floor(in_y);\n\ float y_lerp = in_y - top_y_f;\n\ int top_y_idx = convert_int(top_y_f);\n\ - float bottom_y_f = ceil(in_y);\n\ - int bottom_y_idx= convert_int(bottom_y_f);\n\ - vxc_short8 top_left0, top_right0;\n\ - vxc_short8 bottom_left0, bottom_right0;\n\ - vxc_half8 top_left, top_right;\n\ - vxc_half8 bottom_left, 
bottom_right;\n\ +\n\ + vxc_short8 top_short, bottom_short;\n\ + vxc_half8 top, bottom;\n\ int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(top_left0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.y;\n\ - VXC_ReadImage2DArray(top_left0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.z;\n\ - VXC_ReadImage2DArray(top_left0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.w;\n\ - VXC_ReadImage2DArray(top_left0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, top_left, top_left0, 16);\n\ + VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, top, top_short, 16);\n\ + _viv_asm(COPY, bottom, bottom_short, 16);\n\ \n\ - coord_in.x = right_x_idx.x;\n\ - VXC_ReadImage2DArray(top_right0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.y;\n\ - VXC_ReadImage2DArray(top_right0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.z;\n\ - VXC_ReadImage2DArray(top_right0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.w;\n\ - VXC_ReadImage2DArray(top_right0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, top_right, top_right0, 16);\n\ -\n\ - coord_in.y = bottom_y_idx;\n\ - coord_in.x = left_x_idx.x;\n\ - VXC_ReadImage2DArray(bottom_left0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.y;\n\ - VXC_ReadImage2DArray(bottom_left0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.z;\n\ - VXC_ReadImage2DArray(bottom_left0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.w;\n\ - VXC_ReadImage2DArray(bottom_left0, input, coord_in, 
\\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, bottom_left, bottom_left0, 16);\n\ -\n\ - coord_in.x = right_x_idx.x;\n\ - VXC_ReadImage2DArray(bottom_right0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.y;\n\ - VXC_ReadImage2DArray(bottom_right0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.z;\n\ - VXC_ReadImage2DArray(bottom_right0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.w;\n\ - VXC_ReadImage2DArray(bottom_right0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, bottom_right, bottom_right0, 16);\n\ float4 left4;\n\ float4 right4;\n\ float4 top4;\n\ float4 bottom4;\n\ - VXC_DP4x4(left4, top_left, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4);\n\ - VXC_DP4x4(right4, top_right, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ top4 = right4 * x_lerp + left4;\n\ - VXC_DP4x4(left4, bottom_left, bottom_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4);\n\ - VXC_DP4x4(right4, bottom_right, bottom_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ bottom4 = right4 * x_lerp + left4;\n\ bottom4 -= top4;\n\ float4 dst4 = bottom4 * y_lerp + top4;\n\ @@ -30081,7 +31160,14 @@ __kernel void resize_bilinear_F16toU8_DOWN\n\ int4 dst = convert_int4_rte(dst4);\n\ vxc_uchar8 dst_uchar;\n\ VXC_DP2x8(dst_uchar, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, dst_uchar, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst_uchar,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void resize_bilinear_F16toF16_UP\n\ @@ -30104,24 +31190,26 @@ __kernel void resize_bilinear_F16toF16_UP\n\ float top_y_f = floor(in_y);\n\ float y_lerp = in_y - top_y_f;\n\ int top_y_idx = convert_int(top_y_f);\n\ - float bottom_y_f = ceil(in_y);\n\ - int bottom_y_idx= convert_int(bottom_y_f);\n\ +\n\ \n\ vxc_ushort8 src0, src1, src2, src3, dst0, dst1;\n\ vxc_half8 top;\n\ vxc_half8 bottom;\n\ int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ \n\ - 
coord_in.y = bottom_y_idx;\n\ - VXC_ReadImage2DArray(src2, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src3, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_ushort8 bitextract_p0;\n\ vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};\n\ @@ -30131,32 +31219,41 @@ __kernel void resize_bilinear_F16toF16_UP\n\ VXC_DP2x8(maskShift, bitextract_p0, constData, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);\n\ \n\ - do\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + float4 left4;\n\ + float4 right4;\n\ + float4 top4;\n\ + float4 bottom4;\n\ +\n\ + int loop = depth - 1;\n\ + while (coord_in.z < loop)\n\ {\n\ VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ _viv_asm(COPY, top, dst0, 16);\n\ _viv_asm(COPY, bottom, dst1, 16);\n\ +\n\ +\n\ + coord_in.w += input_desc.s4;\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ coord_in.z ++;\n\ - coord_in.y = top_y_idx;\n\ - VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.y = bottom_y_idx;\n\ - VXC_ReadImage2DArray(src2, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src3, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - float4 left4;\n\ - float4 right4;\n\ - float4 top4;\n\ - float4 bottom4;\n\ - VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4);\n\ - VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);\n\ +\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ top4 = right4 * x_lerp + left4;\n\ - VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4);\n\ - VXC_DP4x4(right4, bottom, 
bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);\n\ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ bottom4 = right4 * x_lerp + left4;\n\ bottom4 -= top4;\n\ float4 dst4 = bottom4 * y_lerp + top4;\n\ @@ -30164,9 +31261,30 @@ __kernel void resize_bilinear_F16toF16_UP\n\ _viv_asm(CONV, tmp, dst4);\n\ VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);\n\ _viv_asm(COPY, dst0, top, 16);\n\ - VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.z ++;\n\ - } while (coord_in.z < depth);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.w += output_desc.s4;\n\ + }\n\ +\n\ + VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, top, dst0, 16);\n\ + _viv_asm(COPY, bottom, dst1, 16);\n\ +\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + top4 = right4 * x_lerp + left4;\n\ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + bottom4 = right4 * x_lerp + left4;\n\ + bottom4 -= top4;\n\ + float4 dst4 = bottom4 * y_lerp + top4;\n\ + half4 tmp;\n\ + _viv_asm(CONV, tmp, dst4);\n\ + VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);\n\ + _viv_asm(COPY, dst0, top, 16);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ "; /* end of resize_bilinear_F16_vx*/ @@ -30177,8 +31295,8 @@ _viv_uniform float2 scale_xy;\n\ _viv_uniform int depth;\n\ _viv_uniform VXC_512Bits uniConvertI32toI16_2x8;\n\ _viv_uniform VXC_512Bits uniGetMaskShift_2x8;\n\ -_viv_uniform VXC_512Bits uniConvertDFP2FP32_part1_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertDFP2FP32_4x4;\n\ +_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4;\n\ +_viv_uniform VXC_512Bits uniRightSubLeft_4x4;\n\ _viv_uniform float dfpScale;\n\ _viv_uniform float half_pixel_value;\n\ \n\ @@ -30206,8 +31324,6 @@ __kernel void resize_bilinear_I16toI16_UP\n\ float top_y_f = floor(in_y);\n\ float y_lerp = in_y - top_y_f;\n\ int top_y_idx = convert_int(top_y_f);\n\ - float bottom_y_f = ceil(in_y);\n\ - int bottom_y_idx= convert_int(bottom_y_f);\n\ \n\ vxc_ushort8 src0, src1, src2, src3, dst0, dst1;\n\ \n\ @@ -30216,16 +31332,19 @@ __kernel void resize_bilinear_I16toI16_UP\n\ \n\ int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ \n\ - coord_in.y = bottom_y_idx;\n\ - 
VXC_ReadImage2DArray(src2, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src3, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_ushort8 bitextract_p0;\n\ vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};\n\ @@ -30235,39 +31354,42 @@ __kernel void resize_bilinear_I16toI16_UP\n\ VXC_DP2x8(maskShift, bitextract_p0, constData, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);\n\ \n\ - do\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + float4 left4;\n\ + float4 right4;\n\ + float4 top4;\n\ + float4 bottom4;\n\ +\n\ + int loop = depth - 1;\n\ + while (coord_in.z < loop)\n\ {\n\ VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ _viv_asm(COPY, top, dst0, 16);\n\ _viv_asm(COPY, bottom, dst1, 16);\n\ -\n\ - float4 left4;\n\ - float4 right4;\n\ - float4 top4;\n\ - float4 bottom4;\n\ -\n\ + coord_in.w += input_desc.s4;\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ coord_in.z ++;\n\ - coord_in.y = top_y_idx;\n\ - VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ - coord_in.y = bottom_y_idx;\n\ - VXC_ReadImage2DArray(src2, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src3, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ - VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4);\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ top4 = right4 * x_lerp + left4;\n\ \n\ VXC_DP4x4(left4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ + VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ VXC_DP4x4(right4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4);\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ bottom4 = right4 * x_lerp + left4;\n\ bottom4 -= top4;\n\ float4 dst4 = bottom4 * y_lerp + top4;\n\ @@ -30275,10 +31397,30 @@ __kernel void resize_bilinear_I16toI16_UP\n\ int4 dst = convert_int4_rte(dst4);\n\ \n\ VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ \n\ - coord_out.z ++;\n\ - } while (coord_in.z < depth);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.w += output_desc.s4;\n\ + }\n\ +\n\ + VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, top, dst0, 16);\n\ + _viv_asm(COPY, bottom, dst1, 16);\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + top4 = right4 * x_lerp + left4;\n\ + VXC_DP4x4(left4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + bottom4 = right4 * x_lerp + left4;\n\ + bottom4 -= top4;\n\ + float4 dst4 = bottom4 * y_lerp + top4;\n\ + dst4 = dst4 * dfpScale;\n\ + int4 dst = convert_int4_rte(dst4);\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ }\n\ \n\ __kernel void resize_bilinear_I16toI16_DOWN\n\ @@ -30297,104 +31439,68 @@ __kernel void resize_bilinear_I16toI16_DOWN\n\ float4 left_x_f = floor(in_x);\n\ float4 x_lerp = in_x - left_x_f;\n\ int4 left_x_idx = convert_int4(left_x_f);\n\ - float4 right_x_f = ceil(in_x);\n\ - int4 right_x_idx = convert_int4(right_x_f);\n\ -\n\ float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;\n\ -\n\ float top_y_f = floor(in_y);\n\ float y_lerp = in_y - top_y_f;\n\ int top_y_idx = convert_int(top_y_f);\n\ - float bottom_y_f = ceil(in_y);\n\ - int bottom_y_idx= convert_int(bottom_y_f);\n\ -\n\ - vxc_short8 top_left, top_right;\n\ - vxc_short8 bottom_left, bottom_right;\n\ \n\ + vxc_short8 top, bottom, result;\n\ int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.y;\n\ - VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 
0));\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.z;\n\ - VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.w;\n\ - VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord_in.x = right_x_idx.x;\n\ - VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.y;\n\ - VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.z;\n\ - VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.w;\n\ - VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord_in.y = bottom_y_idx;\n\ - coord_in.x = left_x_idx.x;\n\ - VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.y;\n\ - VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.z;\n\ - VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.w;\n\ - VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord_in.x = right_x_idx.x;\n\ - VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.y;\n\ - VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.z;\n\ - VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.w;\n\ - VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ float4 left4;\n\ float4 right4;\n\ float4 top4;\n\ float4 bottom4;\n\ \n\ - VXC_DP4x4(left4, top_left, top_left, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ - VXC_DP4x4(right4, top_right, top_right, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ -\n\ - right4 -= left4;\n\ + VXC_DP4x4(left4, top, top, \\\n\ + 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, top, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ top4 = right4 * x_lerp + left4;\n\ \n\ - VXC_DP4x4(left4, bottom_left, bottom_left, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ - VXC_DP4x4(right4, bottom_right, bottom_right, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ -\n\ - right4 -= left4;\n\ + VXC_DP4x4(left4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ bottom4 = right4 * x_lerp + left4;\n\ -\n\ bottom4 -= top4;\n\ float4 dst4 = bottom4 * y_lerp + top4;\n\ -\n\ dst4 = dst4 * dfpScale;\n\ -\n\ int4 dst = convert_int4_rte(dst4);\n\ \n\ - VXC_DP2x8(top_left, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, top_left, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ "; /* end of resize_bilinear_I16_vx*/ @@ -30406,8 +31512,8 @@ _viv_uniform float2 scale_xy;\n\ _viv_uniform int depth;\n\ _viv_uniform VXC_512Bits uniConvertI32toI16_2x8;\n\ _viv_uniform VXC_512Bits uniGetMaskShift_2x8;\n\ -_viv_uniform VXC_512Bits uniConvertDFP2FP32_part1_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertDFP2FP32_4x4;\n\ +_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4;\n\ +_viv_uniform VXC_512Bits uniRightSubLeft_4x4;\n\ _viv_uniform float dfpScale;\n\ _viv_uniform float half_pixel_value;\n\ \n\ @@ -30435,8 +31541,6 @@ __kernel void resize_bilinear_I8toI8_UP\n\ float top_y_f = floor(in_y);\n\ float y_lerp = in_y - top_y_f;\n\ int top_y_idx = convert_int(top_y_f);\n\ - float bottom_y_f = ceil(in_y);\n\ - int bottom_y_idx= convert_int(bottom_y_f);\n\ \n\ vxc_uchar16 src0, src1, dst0, dst1;\n\ \n\ @@ -30445,12 +31549,15 @@ __kernel void resize_bilinear_I8toI8_UP\n\ \n\ int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ \n\ - coord_in.y = bottom_y_idx;\n\ - VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_ushort8 bitextract_p0;\n\ vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};\n\ @@ -30460,37 +31567,42 @@ __kernel void resize_bilinear_I8toI8_UP\n\ VXC_DP2x8(maskShift, bitextract_p0, constData, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), 
uniGetMaskShift_2x8);\n\ \n\ - do\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + float4 left4;\n\ + float4 right4;\n\ + float4 top4;\n\ + float4 bottom4;\n\ +\n\ + int loop = depth - 1;\n\ + while (coord_in.z < loop)\n\ {\n\ VXC_BitExtract(dst0, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ VXC_BitExtract(dst1, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ _viv_asm(COPY, top, dst0, 16);\n\ _viv_asm(COPY, bottom, dst1, 16);\n\ \n\ + coord_in.w += input_desc.s4;\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ coord_in.z ++;\n\ - coord_in.y = top_y_idx;\n\ - VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.y = bottom_y_idx;\n\ - VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - float4 left4;\n\ - float4 right4;\n\ - float4 top4;\n\ - float4 bottom4;\n\ \n\ VXC_DP4x4(left4, top, top, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ VXC_DP4x4(right4, top, top, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4);\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ \n\ top4 = right4 * x_lerp + left4;\n\ \n\ VXC_DP4x4(left4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ VXC_DP4x4(right4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4);\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ \n\ bottom4 = right4 * x_lerp + left4;\n\ bottom4 -= top4;\n\ @@ -30498,10 +31610,31 @@ __kernel void resize_bilinear_I8toI8_UP\n\ dst4 = dst4 * dfpScale;\n\ int4 dst = convert_int4_rte(dst4);\n\ VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ \n\ - coord_out.z ++;\n\ - } while (coord_in.z < depth);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.w += output_desc.s4;\n\ + }\n\ +\n\ + VXC_BitExtract(dst0, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_BitExtract(dst1, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, top, dst0, 16);\n\ + _viv_asm(COPY, bottom, dst1, 16);\n\ + VXC_DP4x4(left4, top, top, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, top, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + top4 = right4 * x_lerp + left4;\n\ + VXC_DP4x4(left4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + bottom4 = right4 * x_lerp + left4;\n\ + 
bottom4 -= top4;\n\ + float4 dst4 = bottom4 * y_lerp + top4;\n\ + dst4 = dst4 * dfpScale;\n\ + int4 dst = convert_int4_rte(dst4);\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void resize_bilinear_I8toI8_DOWN\n\ @@ -30513,98 +31646,55 @@ __kernel void resize_bilinear_I8toI8_DOWN\n\ )\n\ {\n\ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value;\n\ -\n\ float4 left_x_f = floor(in_x);\n\ float4 x_lerp = in_x - left_x_f;\n\ int4 left_x_idx = convert_int4(left_x_f);\n\ - float4 right_x_f = ceil(in_x);\n\ - int4 right_x_idx = convert_int4(right_x_f);\n\ -\n\ float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;\n\ -\n\ float top_y_f = floor(in_y);\n\ float y_lerp = in_y - top_y_f;\n\ int top_y_idx = convert_int(top_y_f);\n\ - float bottom_y_f = ceil(in_y);\n\ - int bottom_y_idx= convert_int(bottom_y_f);\n\ -\n\ - vxc_char16 top_left, top_right;\n\ - vxc_char16 bottom_left, bottom_right;\n\ -\n\ + vxc_char16 top, bottom, result;\n\ int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.y;\n\ - VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.z;\n\ - VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.w;\n\ - VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord_in.x = right_x_idx.x;\n\ - VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.y;\n\ - VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.z;\n\ - VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.w;\n\ - 
VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord_in.y = bottom_y_idx;\n\ - coord_in.x = left_x_idx.x;\n\ - VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.y;\n\ - VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.z;\n\ - VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.w;\n\ - VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord_in.x = right_x_idx.x;\n\ - VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.y;\n\ - VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.z;\n\ - VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.w;\n\ - VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ float4 left4;\n\ float4 right4;\n\ float4 top4;\n\ float4 bottom4;\n\ \n\ - VXC_DP4x4(left4, top_left, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ - VXC_DP4x4(right4, top_right, top_right, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ -\n\ - right4 -= left4;\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ top4 = right4 * x_lerp + left4;\n\ \n\ - VXC_DP4x4(left4, bottom_left, bottom_left, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ - VXC_DP4x4(right4, bottom_right, bottom_right, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ -\n\ - right4 -= left4;\n\ + VXC_DP4x4(left4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ bottom4 = right4 * x_lerp + left4;\n\ \n\ bottom4 -= top4;\n\ @@ -30614,21 +31704,26 @@ __kernel void resize_bilinear_I8toI8_DOWN\n\ \n\ int4 dst = convert_int4_rte(dst4);\n\ \n\ - VXC_DP2x8(top_left, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, top_left, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ + 
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ "; /* end of resize_bilinear_I8_vx*/ static const char resize_bilinear_U8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ -_viv_uniform VXC_512Bits uniU8SubZPtoFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_left_4x4;\n\ +_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4;\n\ _viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ _viv_uniform float2 scale_xy;\n\ _viv_uniform int depth;\n\ _viv_uniform int input_ZP;\n\ _viv_uniform float uint8Scale;\n\ _viv_uniform float output_ZP;\n\ -_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;\n\ _viv_uniform VXC_512Bits uniConvertI32toI16_2x8;\n\ _viv_uniform VXC_512Bits uniGetMaskShift_2x8;\n\ _viv_uniform float half_pixel_value;\n\ @@ -30647,69 +31742,36 @@ __kernel void resize_bilinear_U8toF16_DOWN\n\ float4 left_x_f = floor(in_x);\n\ float4 x_lerp = in_x - left_x_f;\n\ int4 left_x_idx = convert_int4(left_x_f);\n\ - float4 right_x_f = ceil(in_x);\n\ - int4 right_x_idx = convert_int4(right_x_f);\n\ float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;\n\ float top_y_f = floor(in_y);\n\ float y_lerp = in_y - top_y_f;\n\ int top_y_idx = convert_int(top_y_f);\n\ - float bottom_y_f = ceil(in_y);\n\ - int bottom_y_idx= convert_int(bottom_y_f);\n\ - vxc_uchar16 top_left, top_right;\n\ - vxc_uchar16 bottom_left, bottom_right;\n\ + vxc_uchar16 top, bottom;\n\ int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.y;\n\ - VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.z;\n\ - VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.w;\n\ - VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord_in.x = right_x_idx.x;\n\ - VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.y;\n\ - VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.z;\n\ - VXC_ReadImage2DArray(top_right, input, coord_in, 
\\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.w;\n\ - VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord_in.y = bottom_y_idx;\n\ - coord_in.x = left_x_idx.x;\n\ - VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.y;\n\ - VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.z;\n\ - VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.w;\n\ - VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord_in.x = right_x_idx.x;\n\ - VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.y;\n\ - VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.z;\n\ - VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.w;\n\ - VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ float4 left4;\n\ float4 right4;\n\ @@ -30718,16 +31780,12 @@ __kernel void resize_bilinear_U8toF16_DOWN\n\ \n\ unsigned char inputZP;\n\ _viv_asm(COPY, inputZP, input_ZP, 4);\n\ - VXC_DP4x4(left4, top_left, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);\n\ - VXC_DP4x4(right4, top_right, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);\n\ -\n\ - right4 -= left4;\n\ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ top4 = right4 * x_lerp + left4;\n\ \n\ - VXC_DP4x4(left4, bottom_left, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);\n\ - VXC_DP4x4(right4, bottom_right, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);\n\ -\n\ - right4 -= left4;\n\ + VXC_DP4x4(left4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ bottom4 = right4 * x_lerp + left4;\n\ \n\ bottom4 -= top4;\n\ @@ -30741,7 +31799,12 @@ __kernel void resize_bilinear_U8toF16_DOWN\n\ vxc_short8 dst_short;\n\ _viv_asm(COPY, dst_short, dst, 16);\n\ \n\ - VXC_WriteImage2DArray(output, coord_out, dst_short.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, 
baseAddr);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst_short.s0246,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void resize_bilinear_U8toU8_UP\n\ @@ -30768,8 +31831,6 @@ __kernel void resize_bilinear_U8toU8_UP\n\ float top_y_f = floor(in_y);\n\ float y_lerp = in_y - top_y_f;\n\ int top_y_idx = convert_int(top_y_f);\n\ - float bottom_y_f = ceil(in_y);\n\ - int bottom_y_idx= convert_int(bottom_y_f);\n\ \n\ vxc_uchar16 src0, src1;\n\ \n\ @@ -30778,12 +31839,15 @@ __kernel void resize_bilinear_U8toU8_UP\n\ \n\ int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ \n\ - coord_in.y = bottom_y_idx;\n\ - VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_ushort8 bitextract_p0;\n\ vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};\n\ @@ -30793,46 +31857,67 @@ __kernel void resize_bilinear_U8toU8_UP\n\ VXC_DP2x8(maskShift, bitextract_p0, constData, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);\n\ \n\ - do\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + float4 left4;\n\ + float4 right4;\n\ + float4 top4;\n\ + float4 bottom4;\n\ +\n\ + int loop = depth - 1;\n\ + while (coord_in.z < loop)\n\ {\n\ VXC_BitExtract(top, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ VXC_BitExtract(bottom, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ + coord_in.w += input_desc.s4;\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ coord_in.z ++;\n\ - coord_in.y = top_y_idx;\n\ - VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.y = bottom_y_idx;\n\ - VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - float4 left4;\n\ - float4 right4;\n\ - float4 top4;\n\ - float4 bottom4;\n\ -\n\ unsigned char inputZP;\n\ _viv_asm(COPY, inputZP, input_ZP, 4);\n\ - VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);\n\ - VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);\n\ -\n\ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ top4 = right4 * x_lerp + left4;\n\ \n\ VXC_DP4x4(left4, bottom, inputZP, \\\n\ - VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ VXC_DP4x4(right4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);\n\ -\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ bottom4 = right4 * x_lerp + left4;\n\ bottom4 -= top4;\n\ float4 dst4 = bottom4 * y_lerp + top4;\n\ dst4 = dst4 * uint8Scale + output_ZP;\n\ int4 dst = convert_int4_rte(dst4);\n\ VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.w += output_desc.s4;\n\ + }\n\ \n\ - coord_out.z ++;\n\ - } while (coord_in.z < depth);\n\ + VXC_BitExtract(top, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_BitExtract(bottom, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + unsigned char inputZP;\n\ + _viv_asm(COPY, inputZP, input_ZP, 4);\n\ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ + top4 = right4 * x_lerp + left4;\n\ +\n\ + VXC_DP4x4(left4, bottom, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ + bottom4 = right4 * x_lerp + left4;\n\ + bottom4 -= top4;\n\ + float4 dst4 = bottom4 * y_lerp + top4;\n\ + dst4 = dst4 * uint8Scale + output_ZP;\n\ + int4 dst = convert_int4_rte(dst4);\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void resize_bilinear_U8toU8_DOWN\n\ @@ -30849,69 +31934,36 @@ __kernel void resize_bilinear_U8toU8_DOWN\n\ float4 left_x_f = floor(in_x);\n\ float4 x_lerp = in_x - left_x_f;\n\ int4 left_x_idx = convert_int4(left_x_f);\n\ - float4 right_x_f = ceil(in_x);\n\ - int4 right_x_idx = convert_int4(right_x_f);\n\ float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;\n\ float top_y_f = floor(in_y);\n\ float y_lerp = in_y - top_y_f;\n\ int top_y_idx = convert_int(top_y_f);\n\ - float bottom_y_f = ceil(in_y);\n\ - int bottom_y_idx= convert_int(bottom_y_f);\n\ - vxc_uchar16 top_left, top_right;\n\ - vxc_uchar16 bottom_left, bottom_right;\n\ + vxc_uchar16 top, bottom, result;\n\ int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.y;\n\ - VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 
0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.z;\n\ - VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.w;\n\ - VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord_in.x = right_x_idx.x;\n\ - VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.y;\n\ - VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.z;\n\ - VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.w;\n\ - VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord_in.y = bottom_y_idx;\n\ - coord_in.x = left_x_idx.x;\n\ - VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.y;\n\ - VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.z;\n\ - VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.w;\n\ - VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord_in.x = right_x_idx.x;\n\ - VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.y;\n\ - VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.z;\n\ - VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.w;\n\ - VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ float4 left4;\n\ float4 right4;\n\ @@ -30920,27 +31972,28 @@ __kernel void resize_bilinear_U8toU8_DOWN\n\ \n\ unsigned char inputZP;\n\ _viv_asm(COPY, inputZP, input_ZP, 4);\n\ - VXC_DP4x4(left4, top_left, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);\n\ - VXC_DP4x4(right4, top_right, inputZP, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);\n\ -\n\ - right4 -= left4;\n\ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ top4 = right4 * x_lerp + left4;\n\ \n\ - VXC_DP4x4(left4, bottom_left, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);\n\ - VXC_DP4x4(right4, bottom_right, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);\n\ -\n\ - right4 -= left4;\n\ + VXC_DP4x4(left4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ bottom4 = right4 * x_lerp + left4;\n\ \n\ bottom4 -= top4;\n\ float4 dst4 = bottom4 * y_lerp + top4;\n\ \n\ dst4 = dst4 * uint8Scale + output_ZP;\n\ -\n\ int4 dst = convert_int4_rte(dst4);\n\ \n\ - VXC_DP2x8(top_left, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, top_left, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ "; /* end of resize_bilinear_U8_vx*/ @@ -31082,7 +32135,8 @@ __kernel void resize_bilinear_U8toU8_UP_opt\n\ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_out.w, baseAddr);\n\ \n\ - do\n\ + int loop = depth - 1;\n\ + while (coord_in.z < loop)\n\ {\n\ VXC_BitExtract(top_bottom, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ VXC_BitExtract(top_bottom, src1, src1, maskShift, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));\n\ @@ -31101,8 +32155,17 @@ __kernel void resize_bilinear_U8toU8_UP_opt\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ coord_out.w += output_desc.s4;\n\ \n\ - coord_out.z ++;\n\ - } while (coord_out.z < depth);\n\ + coord_in.z ++;\n\ + }\n\ +\n\ + VXC_BitExtract(top_bottom, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_BitExtract(top_bottom, src1, src1, maskShift, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));\n\ + vxc_uchar16 dst;\n\ + VXC_DP4x4_b(dst, lerp.hi, lerp.lo, top_bottom,\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4x4_b);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ }\n\ \n\ #endif"; /* end of resize_bilinear_U8_opt_vx*/ @@ -31137,18 +32200,30 @@ __kernel void resize_nearest_F16toF16\n\ vxc_short8 src;\n\ int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(src, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.y;\n\ - 
VXC_ReadImage2DArray(src, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.z;\n\ - VXC_ReadImage2DArray(src, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.w;\n\ - VXC_ReadImage2DArray(src, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ _viv_uniform VXC_512Bits uniGetExtractData_2x8;\n\ @@ -31165,18 +32240,29 @@ __kernel void resize_nearest_F16toF16_op\n\ vxc_ushort8 src0, src1, dst;\n\ int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - //in_x_idx = in_x_idx - in_x_idx.xxxx;\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16);\n\ vxc_ushort8 input_idx;\n\ _viv_asm(COPY, input_idx, in_x_idx, 16);\n\ VXC_DP2x8(mask, input_idx, input_idx, \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetExtractData_2x8);\n\ VXC_BitExtract(dst, src0, src1, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ _viv_uniform VXC_512Bits uniConvertI8toI8_2x8;\n\ @@ -31193,19 +32279,31 @@ __kernel void resize_nearest_I8toI8\n\ vxc_char16 src;\n\ int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(src, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + 
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.y;\n\ - VXC_ReadImage2DArray(src, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.z;\n\ - VXC_ReadImage2DArray(src, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.w;\n\ - VXC_ReadImage2DArray(src, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ VXC_DP2x8(src, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void resize_nearest_I8toI8_op\n\ @@ -31222,8 +32320,14 @@ __kernel void resize_nearest_I8toI8_op\n\ vxc_char16 dst;\n\ int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8);\n\ vxc_ushort8 input_idx;\n\ _viv_asm(COPY, input_idx, in_x_idx, 16);\n\ @@ -31232,7 +32336,13 @@ __kernel void resize_nearest_I8toI8_op\n\ VXC_BitExtract(dst0, src0, src0, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ _viv_asm(COPY, dst, dst0, 8);\n\ VXC_DP2x8(dst, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void resize_nearest_U8toU8\n\ @@ -31248,22 +32358,34 @@ __kernel void resize_nearest_U8toU8\n\ vxc_uchar16 src;\n\ int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(src, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + 
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.y;\n\ - VXC_ReadImage2DArray(src, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.z;\n\ - VXC_ReadImage2DArray(src, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.w;\n\ - VXC_ReadImage2DArray(src, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ vxc_ushort8 multiplier;\n\ _viv_asm(COPY, multiplier, multAndoutZP, 16);\n\ VXC_DP2x8(src, src, multiplier, \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void resize_nearest_U8toU8_op\n\ @@ -31279,8 +32401,14 @@ __kernel void resize_nearest_U8toU8_op\n\ vxc_uchar16 src0, dst;\n\ int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8);\n\ vxc_ushort8 input_idx;\n\ _viv_asm(COPY, input_idx, in_x_idx, 16);\n\ @@ -31289,7 +32417,13 @@ __kernel void resize_nearest_U8toU8_op\n\ vxc_ushort8 multiplier;\n\ _viv_asm(COPY, multiplier, multAndoutZP, 16);\n\ VXC_DP2x8(dst, dst, multiplier, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void resize_nearest_I16toI16\n\ @@ -31305,19 +32439,32 @@ __kernel void resize_nearest_I16toI16\n\ vxc_short8 src;\n\ int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(src, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + 
input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.y;\n\ - VXC_ReadImage2DArray(src, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.z;\n\ - VXC_ReadImage2DArray(src, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.w;\n\ - VXC_ReadImage2DArray(src, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ VXC_DP2x8(src, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void resize_nearest_I16toI16_op\n\ @@ -31333,10 +32480,16 @@ __kernel void resize_nearest_I16toI16_op\n\ vxc_ushort8 src0, src1, dst0;\n\ vxc_short8 dst;\n\ int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);\n\ - VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ //in_x_idx = in_x_idx - in_x_idx.xxxx;\n\ vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16);\n\ @@ -31346,7 +32499,13 @@ __kernel void resize_nearest_I16toI16_op\n\ VXC_BitExtract(dst0, src0, src1, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ _viv_asm(COPY, dst, dst0, 8);\n\ VXC_DP2x8(dst, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ "; /* end of resize_nearest_vx*/ @@ -31673,6 +32832,142 @@ __kernel void select_I8_U8_U8toU8_2D(\n\ }\n\ "; /* end of select_vx*/ +static const char space2depth_internal_vx[] = 
"#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniExtractEvenUint8Stride2_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddUint8Stride2_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractEvenFp16Stride2_4x4;\n\ +_viv_uniform VXC_512Bits uniExtractOddFp16Stride2_4x4;\n\ +\n\ +_viv_uniform int input_depth;\n\ +\n\ +#define SPACE2DEPTH_INTERNAL_QINT_TO_QINT(src0_type_name, src1_type_name, read_type) \\\n\ +__kernel void space2depth_internal_##src0_type_name##to##src1_type_name( \\\n\ + image2d_array_t input, \\\n\ + image2d_array_t output, \\\n\ + int block_size_x, \\\n\ + int block_size_y \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0); \\\n\ + read_type src; \\\n\ + VXC_ReadImage2DArray(src, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + ushort stride_x = (ushort)block_size_x; \\\n\ + ushort stride_y = (ushort)block_size_y; \\\n\ + ushort sidx = (ushort)gidx; \\\n\ + ushort sidy = (ushort)gidy; \\\n\ + ushort tmpX = sidx % stride_x; \\\n\ + ushort tmpY = sidy % stride_y; \\\n\ + int tmpId0 = tmpX; \\\n\ + int tmpId1 = tmpY; \\\n\ + int4 coord_out = (int4)((int)(sidx / stride_x), (int)(sidy / stride_y), 0, 0); \\\n\ + coord_out.z = tmpId0 * input_depth + tmpId1 * block_size_x * input_depth + gidz; \\\n\ + VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +SPACE2DEPTH_INTERNAL_QINT_TO_QINT(U8, U8, vxc_uchar16)\n\ +SPACE2DEPTH_INTERNAL_QINT_TO_QINT(I8, I8, vxc_char16)\n\ +SPACE2DEPTH_INTERNAL_QINT_TO_QINT(I16, I16, vxc_short8)\n\ +\n\ +__kernel void space2depth_internal_F16toF16(\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int block_size_x,\n\ + int block_size_y\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + vxc_short8 data, imgVal0;\n\ + VXC_ReadImage2DArray(data, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + ushort stride_x = (ushort)block_size_x;\n\ + ushort stride_y = (ushort)block_size_y;\n\ + ushort sidx = (ushort)gidx;\n\ + ushort sidy = (ushort)gidy;\n\ + ushort tmpX = sidx % stride_x;\n\ + ushort tmpY = sidy % stride_y;\n\ + int tmpId0 = tmpX;\n\ + int tmpId1 = tmpY;\n\ + int4 coord_out = (int4)((int)(sidx / stride_x), (int)(sidy / stride_y), 0, 0);\n\ + coord_out.z = tmpId0 * input_depth + tmpId1 * block_size_x * input_depth + gidz;\n\ +\n\ + VXC_WriteImage2DArray(output, coord_out, data, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +#define SPACE2DEPTH_INTERNAL_QINT_TO_QINT_X2Y1(src0_type_name, src1_type_name, read_type, write_type) \\\n\ +__kernel void space2depth_internal_##src0_type_name##to##src1_type_name##_X2Y1( \\\n\ + image2d_array_t input, \\\n\ + image2d_array_t output, \\\n\ + int block_size_x, \\\n\ + int block_size_y \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + \\\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0); \\\n\ + int4 coord_out = (int4)(gidx >> 1, gidy, gidz, 0); \\\n\ + int out_d1; \\\n\ + read_type imageData; \\\n\ + write_type imgVal0, imgVal1; \\\n\ + \\\n\ + VXC_ReadImage2DArray(imageData, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + out_d1 = gidz + 
input_depth; \\\n\ + \\\n\ + VXC_DP2x8(imgVal0, imageData, imageData,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractEvenUint8Stride2_2x8); \\\n\ + VXC_DP2x8(imgVal1, imageData, imageData,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddUint8Stride2_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord_out, imgVal0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_out.z = out_d1; \\\n\ + VXC_WriteImage2DArray(output, coord_out, imgVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +SPACE2DEPTH_INTERNAL_QINT_TO_QINT_X2Y1(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +SPACE2DEPTH_INTERNAL_QINT_TO_QINT_X2Y1(I8, I8, vxc_char16, vxc_char16)\n\ +\n\ +#define SPACE2DEPTH_INTERNAL_16BITS_X2Y1(src0_type_name, src1_type_name, read_type, write_type) \\\n\ +__kernel void space2depth_internal_##src0_type_name##to##src1_type_name##_X2Y1( \\\n\ + image2d_array_t input, \\\n\ + image2d_array_t output, \\\n\ + int block_size_x, \\\n\ + int block_size_y \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + \\\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0); \\\n\ + int4 coord_out = (int4)(gidx >> 1, gidy, gidz, 0); \\\n\ + int out_d1; \\\n\ + read_type imageData; \\\n\ + write_type imgVal0, imgVal1; \\\n\ + \\\n\ + VXC_ReadImage2DArray(imageData, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_d1 = gidz + input_depth; \\\n\ + VXC_DP4x4(imgVal0, imageData, imageData, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractEvenFp16Stride2_4x4); \\\n\ + VXC_DP4x4(imgVal1, imageData, imageData, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractOddFp16Stride2_4x4); \\\n\ + VXC_WriteImage2DArray(output, coord_out, imgVal0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_out.z = out_d1; \\\n\ + VXC_WriteImage2DArray(output, coord_out, imgVal1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +SPACE2DEPTH_INTERNAL_16BITS_X2Y1(I16, I16, vxc_short8, vxc_short8)\n\ +SPACE2DEPTH_INTERNAL_16BITS_X2Y1(F16, F16, vxc_short8, vxc_short8)"; /* end of space2depth_internal_vx*/ + static const char swish_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform float logE;\n\ @@ -33164,6 +34459,151 @@ __kernel void upsample_U8_U8to_F16_2D\n\ }\n\ "; /* end of upsample_U8_vx*/ +static const char upsamplescale_vx[] = "\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertDatatoF32_4x4;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float tail;\n\ +\n\ +#define UPSAMPLE_SCALETO_FUN(src_name, dst_name, read_type, src_type, dst_type, write_type, conv_func) \\\n\ + __kernel void upsamplescale_##src_name##to##dst_name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int stride, \\\n\ + float scale) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + read_type read_val; \\\n\ + src_type src_val; \\\n\ + dst_type dst_val; \\\n\ + write_type write_val; \\\n\ + VXC_ReadImage2DArray(read_val, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src_val, read_val, 16); \\\n\ + coord.xy *= stride; \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord.w, baseAddr); \\\n\ + float4 data; \\\n\ + 
VXC_DP4x4(data, src_val, src_val, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertDatatoF32_4x4); \\\n\ + data = data * output_scale + tail; \\\n\ + _viv_asm(conv_func, dst_val, data); \\\n\ + _viv_asm(COPY, write_val, dst_val, 16); \\\n\ + int4 coord_out = coord; \\\n\ + for (int y = 0; y < stride; y++) \\\n\ + { \\\n\ + coord_out.x = coord.x; \\\n\ + for (int x = 0; x < stride; ) \\\n\ + { \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, write_val, \\\n\ + VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); \\\n\ + x++; \\\n\ + coord_out.x ++; \\\n\ + } \\\n\ + coord_out.y ++; \\\n\ + } \\\n\ +}\n\ +\n\ +UPSAMPLE_SCALETO_FUN(F16, F16, vxc_short8, vxc_half8, half4, short4, CONV)\n\ +UPSAMPLE_SCALETO_FUN(F16, I16, vxc_short8, vxc_half8, int4, short4, CONV_RTE)\n\ +UPSAMPLE_SCALETO_FUN(F16, I8, vxc_short8, vxc_half8, int4, char4, CONV_RTE)\n\ +UPSAMPLE_SCALETO_FUN(F16, U8, vxc_short8, vxc_half8, int4, uchar4, CONV_RTE)\n\ +UPSAMPLE_SCALETO_FUN(I16, I16, vxc_short8, vxc_short8, int4, short4, CONV_RTE)\n\ +UPSAMPLE_SCALETO_FUN(I16, F16, vxc_short8, vxc_short8, half4, short4, CONV)\n\ +UPSAMPLE_SCALETO_FUN(I8, I8, vxc_char16, vxc_char16, int4, char4, CONV_RTE)\n\ +UPSAMPLE_SCALETO_FUN(I8, F16, vxc_short8, vxc_short8, half4, short4, CONV)\n\ +UPSAMPLE_SCALETO_FUN(U8, U8, vxc_uchar16, vxc_uchar16, int4, uchar4, CONV_RTE)\n\ +UPSAMPLE_SCALETO_FUN(U8, F16, vxc_short8, vxc_short8, half4, short4, CONV)\n\ +\n\ +"; /* end of upsamplescale_vx*/ + +static const char upsamplescale_k2_vx[] = "\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniUpScale2X_lo_2x8;\n\ +_viv_uniform VXC_512Bits uniUpScale2X_hi_2x8;\n\ +_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp\n\ +\n\ +#define UPSAMPLE_SCALETO8B_FUN(src_name, dst_name, read_type, src_type, dst_type) \\\n\ + __kernel void upsamplescale_##src_name##to##dst_name##_K2( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int stride, \\\n\ + float scale) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + read_type read_val; \\\n\ + src_type src_val; \\\n\ + dst_type dst_val; \\\n\ + VXC_ReadImage2DArray(read_val, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src_val, read_val, 16); \\\n\ + coord.xy <<= 1; \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord.w, baseAddr); \\\n\ + vxc_ushort8 multiplier; \\\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \\\n\ + VXC_DP2x8(dst_val, src_val, multiplier, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniUpScale2X_lo_2x8); \\\n\ + VXC_DP2x8(dst_val, src_val, multiplier, \\\n\ + VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniUpScale2X_hi_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst_val, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord.y ++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst_val, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +\n\ +UPSAMPLE_SCALETO8B_FUN(F16, I8, vxc_short8, vxc_half8, vxc_char16)\n\ +UPSAMPLE_SCALETO8B_FUN(F16, U8, vxc_short8, vxc_half8, vxc_uchar16)\n\ +UPSAMPLE_SCALETO8B_FUN(I8, I8, vxc_char16, vxc_char16, vxc_char16)\n\ +UPSAMPLE_SCALETO8B_FUN(U8, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16)\n\ +\n\ +#define UPSAMPLE_SCALETO16B_FUN(src_name, dst_name, 
read_type, src_type, dst_type, write_type) \\\n\ + __kernel void upsamplescale_##src_name##to##dst_name##_K2( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int stride, \\\n\ + float scale) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + read_type read_val; \\\n\ + src_type src_val; \\\n\ + dst_type dst0_val; \\\n\ + dst_type dst1_val; \\\n\ + write_type write_val; \\\n\ + VXC_ReadImage2DArray(read_val, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src_val, read_val, 16); \\\n\ + coord.xy <<= 1; \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord.w, baseAddr); \\\n\ + vxc_ushort8 multiplier; \\\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \\\n\ + VXC_DP2x8(dst0_val, src_val, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniUpScale2X_lo_2x8); \\\n\ + VXC_DP2x8(dst1_val, src_val, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniUpScale2X_hi_2x8); \\\n\ + _viv_asm(COPY, write_val, dst0_val, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord.y ++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, write_val, dst1_val, 16); \\\n\ + coord.xy = coord.xy + (int2)(8, -1); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord.y ++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +UPSAMPLE_SCALETO16B_FUN(F16, F16, vxc_short8, vxc_half8, vxc_half8, vxc_short8)\n\ +UPSAMPLE_SCALETO16B_FUN(F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)\n\ +UPSAMPLE_SCALETO16B_FUN(I8, F16, vxc_char16, vxc_char16, vxc_half8, vxc_short8)\n\ +UPSAMPLE_SCALETO16B_FUN(U8, F16, vxc_uchar16, vxc_uchar16, vxc_half8, vxc_short8)\n\ +UPSAMPLE_SCALETO16B_FUN(I16, F16, vxc_short8, vxc_short8, vxc_half8, vxc_short8)\n\ +UPSAMPLE_SCALETO16B_FUN(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)\n\ +"; /* end of upsamplescale_k2_vx*/ + static const char vsi_nn_kernel_axis_aligned_bbox_transform_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ __kernel void vxcAxis_aligned_bbox_transform(\n\ @@ -33184,119 +34624,6 @@ __kernel void vxcBox_with_nms_limit(\n\ }\n\ "; /* end of vsi_nn_kernel_box_with_nms_limit_vx*/ -static const char vsi_nn_kernel_crop_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -//-----------------------------------------------tensor crop-------------------------------\n\ -__kernel void vxcTensorCrop_Int16(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output,\n\ - int offset0,\n\ - int offset1,\n\ - int offset2)\n\ -{\n\ - int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ - vxc_ushort8 src0, src1, src2, src3;\n\ -\n\ - VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - 
VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - int4 coord_out = (int4)(get_global_id(0) - offset0, get_global_id(1)\\\n\ - - offset1, get_global_id(2) - offset2, 0);\n\ -\n\ - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.y ++;\n\ - VXC_WriteImage2DArray(output, coord_out, src1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.y ++;\n\ - VXC_WriteImage2DArray(output, coord_out, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.y ++;\n\ - VXC_WriteImage2DArray(output, coord_out, src3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void vxcTensorCrop_Int8(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output,\n\ - int offset0,\n\ - int offset1,\n\ - int offset2)\n\ -{\n\ - int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_uchar16 src0, src1, src2, src3;\n\ -\n\ - VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - int4 coord_out = (int4)(get_global_id(0) - offset0, get_global_id(1) - offset1,\\\n\ - get_global_id(2) - offset2, 0);\n\ -\n\ - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.y ++;\n\ - VXC_WriteImage2DArray(output, coord_out, src1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.y ++;\n\ - VXC_WriteImage2DArray(output, coord_out, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.y ++;\n\ - VXC_WriteImage2DArray(output, coord_out, src3, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertInt16toFp16_2x8;\n\ -\n\ -__kernel void vxcTensorCrop_Int16_Fp16(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output,\n\ - int offset0,\n\ - int offset1,\n\ - int offset2)\n\ -{\n\ - int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ - vxc_short8 src0, src1, src2, src3;\n\ -\n\ - VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - int4 coord_out = (int4)(get_global_id(0) - offset0, get_global_id(1)\\\n\ - - offset1, get_global_id(2) - offset2, 0);\n\ -\n\ - vxc_half8 dst0, dst1, dst2, dst3;\n\ - VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt16toFp16_2x8);\n\ - VXC_DP2x8(dst1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt16toFp16_2x8);\n\ - VXC_DP2x8(dst2, src2, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt16toFp16_2x8);\n\ - VXC_DP2x8(dst3, 
src3, src3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt16toFp16_2x8);\n\ -\n\ - vxc_short8 out0, out1, out2, out3;\n\ - _viv_asm(COPY, out0, dst0, 16);\n\ - _viv_asm(COPY, out1, dst1, 16);\n\ - _viv_asm(COPY, out2, dst2, 16);\n\ - _viv_asm(COPY, out3, dst3, 16);\n\ -\n\ - VXC_WriteImage2DArray(output, coord_out, out0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.y ++;\n\ - VXC_WriteImage2DArray(output, coord_out, out1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.y ++;\n\ - VXC_WriteImage2DArray(output, coord_out, out2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.y ++;\n\ - VXC_WriteImage2DArray(output, coord_out, out3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -"; /* end of vsi_nn_kernel_crop_vx*/ - static const char vsi_nn_kernel_detection_postprocess_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ __kernel void vxcDetection_postprocess(\n\ @@ -33352,71 +34679,6 @@ __kernel void vxcExtra_ending_u8(\n\ }\n\ "; /* end of vsi_nn_kernel_extra_ending_vx*/ -static const char vsi_nn_kernel_fullconnect2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform int loopNum;\n\ -_viv_uniform VXC_512Bits uniMulAcc_16x1;\n\ -__kernel void vsi_nn_kernel_fullconnect2(\n\ - __read_only image2d_array_t input,\n\ - __read_only image2d_array_t weight,\n\ - __read_only image2d_array_t bias,\n\ - __write_only image2d_array_t output)\n\ -{\n\ - int4 coord_in = (int4)(16, get_global_id(0), get_global_id(1), 0);\n\ - int2 coord_out = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_short8 v0, v1, v2, v3, v4, v5, v6, v7;\n\ - vxc_half8 i0, i1, i2, i3;\n\ - vxc_half8 w0, w1, w2, w3;\n\ - float4 sum = 0;\n\ - float dst = 0;\n\ - dst = read_imagef(bias, coord_in.ywww).x;\n\ - do\n\ - {\n\ - VXC_ReadImage(v0, input, coord_in.xz, VXC_5BITOFFSET_XY(-16, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, i0, v0, 16);\n\ - VXC_ReadImage(v1, weight, coord_in.xy, VXC_5BITOFFSET_XY(-16, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, w0, v1, 16);\n\ - VXC_ReadImage(v2, input, coord_in.xz, VXC_5BITOFFSET_XY(-8, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, i1, v2, 16);\n\ - VXC_ReadImage(v3, weight, coord_in.xy, VXC_5BITOFFSET_XY(-8, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, w1, v3, 16);\n\ - VXC_ReadImage(v4, input, coord_in.xz, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, i2, v4, 16);\n\ - VXC_ReadImage(v5, weight, coord_in.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, w2, v5, 16);\n\ - VXC_ReadImage(v6, input, coord_in.xz, VXC_5BITOFFSET_XY(8, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, i3, v6, 16);\n\ - VXC_ReadImage(v7, weight, coord_in.xy, VXC_5BITOFFSET_XY(8, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, w3, v7, 16);\n\ -\n\ - coord_in.x += 32;\n\ -\n\ - VXC_DP16x1(sum, i0, w0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniMulAcc_16x1);\n\ - VXC_DP16x1(sum, i1, w1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniMulAcc_16x1);\n\ - VXC_DP16x1(sum, i2, w2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), uniMulAcc_16x1);\n\ - VXC_DP16x1(sum, i3, w3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), uniMulAcc_16x1);\n\ -\n\ - float4 tmp = {1, 1, 1, 1};\n\ - dst = dst + dot(sum, tmp);\n\ -\n\ - } while (coord_in.x < loopNum);\n\ -\n\ - vxc_half v;\n\ - 
_viv_asm(CONV, v, dst);\n\ - _viv_asm(COPY, v0, v, 16);\n\ - VXC_WriteImage(output, coord_out.xy, v0, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -"; /* end of vsi_nn_kernel_fullconnect2_vx*/ - static const char vsi_nn_kernel_generate_proposals_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ __kernel void vxcGenerate_proposals(\n\ @@ -35071,426 +36333,6 @@ __kernel void GrayScaletoTensor_UInt8\n\ }\n\ "; /* end of vsi_nn_kernel_imageprocess_5_vx*/ -static const char vsi_nn_kernel_layernormalize_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -/**************************layernorm float16***********************************/\n\ -_viv_uniform int width;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4;\n\ -\n\ -__kernel void vxcLayerNorm(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_array_t output,\n\ - float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ - vxc_short8 src0, src1;\n\ - vxc_float sum = 0, sqr = 0;\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ - {\n\ - vxc_half8 val0_h;\n\ - _viv_asm(COPY, val0_h, src0, 16);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - vxc_float4 sumsqr;\n\ - VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniFp16SumSqr_dp8x2);\n\ - sum += sumsqr.x;\n\ - sqr += sumsqr.y;\n\ - }\n\ - vxc_float mean;\n\ - mean = sum * dimRatio;\n\ - vxc_float vari;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ - vxc_float4 bias_f;\n\ - for(coord.x = 0; coord.x < width; coord.x += 4)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord.xwww);\n\ - vxc_half8 in_h, scale_h;\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - vxc_float4 in_f, scale_f;\n\ - VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - vxc_float4 sub, norm;\n\ - sub = in_f - mean;\n\ - norm = scale_f * vari * sub + bias_f;\n\ - half4 norm_h;\n\ - _viv_asm(CONV, norm_h, norm);\n\ - vxc_half8 dst;\n\ - VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniExtractHalf4_dp4x4);\n\ - vxc_short8 dstval;\n\ - _viv_asm(COPY, dstval, dst, 16);\n\ - VXC_WriteImage(output, coord.xy, dstval, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -/*****************************layernorm uint8 to uint8****************************/\n\ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ -_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ -_viv_uniform float input_scale;\n\ -_viv_uniform int inputZP;\n\ -_viv_uniform float 
outputScale;\n\ -_viv_uniform int output_ZP;\n\ -_viv_uniform int sumInZp;\n\ -_viv_uniform int tmpZp1;\n\ -_viv_uniform int tmpZp2;\n\ -_viv_uniform float e2InScale;\n\ -_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -\n\ -__kernel void vxcLayerNorm_u8(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_array_t output,\n\ - float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ - vxc_uchar16 src0, src2;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float sum = 0, sqr = 0;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - int tmpSum = 0, tmpSqr = 0;\n\ - vxc_int4 tmpSum1;\n\ - vxc_int4 tmpSqr1;\n\ - short zp = inputZP;\n\ -\n\ - for(coord.x = 0; coord.x < width; coord.x += 16)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ - tmpSum += (tmpSum1.x);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ - tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);\n\ - }\n\ - sum = (tmpSum + sumInZp) * input_scale;\n\ - sqr = (tmpSqr + tmpZp2) * e2InScale;\n\ -\n\ - float mean, vari;\n\ - mean = sum * dimRatio;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ - int4 coord_bias = (int4)(0, 0, 0, 0);\n\ -\n\ - for(coord.x = 0; coord.x < width; coord.x += 16)\n\ - {\n\ - coord_bias.x = coord.x;\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ -\n\ - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert4thUint8SubZpToFp32_4x4);\n\ - tmpData0 *= input_scale;\n\ - tmpData1 *= input_scale;\n\ - tmpData2 *= input_scale;\n\ - tmpData3 *= input_scale;\n\ -\n\ - vxc_float4 norm;\n\ - tmpData0 -= mean;\n\ - norm = scale_f0 * vari * tmpData0 + bias_f0;\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - coord_bias.x += 4;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_ZP);\n\ -\n\ - tmpData1 -= mean;\n\ - norm = scale_f1 * vari * tmpData1 + 
bias_f1;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_ZP);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ -\n\ - tmpData2 -= mean;\n\ - norm = scale_f0 * vari * tmpData2 + bias_f0;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_ZP);\n\ -\n\ - tmpData3 -= mean;\n\ - norm = scale_f1 * vari * tmpData3 + bias_f1;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_ZP);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -/***************************layernorm float16 to uint8**************************/\n\ -_viv_uniform float outputZP;\n\ -__kernel void vxcLayerNormFP16toU8(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_array_t output,\n\ - float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ - vxc_short8 src0, src1;\n\ - vxc_float sum = 0, sqr = 0;\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ - {\n\ - vxc_half8 val0_h;\n\ - _viv_asm(COPY, val0_h, src0, 16);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - vxc_float4 sumsqr;\n\ - VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniFp16SumSqr_dp8x2);\n\ - sum += sumsqr.x;\n\ - sqr += sumsqr.y;\n\ - }\n\ - vxc_float mean;\n\ - mean = sum * dimRatio;\n\ - vxc_float vari;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ - vxc_float4 bias_f;\n\ - for(coord.x = 0; coord.x < width; coord.x += 4)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord.xwww);\n\ - vxc_half8 in_h, scale_h;\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - vxc_float4 in_f, scale_f;\n\ - VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - vxc_float4 sub, norm;\n\ - sub = in_f - mean;\n\ - norm = scale_f * vari * sub + bias_f;\n\ - norm = norm * outputScale + outputZP;\n\ - int4 output_int4;\n\ - output_int4 = convert_int4_rte(norm);\n\ - vxc_uchar8 dst;\n\ - VXC_DP2x8(dst, output_int4, output_int4, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}"; /* end of vsi_nn_kernel_layernormalize_vx*/ - -static const char vsi_nn_kernel_layernormalize_U8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -/*****************************layernorm uint8 to fp16****************************/\n\ -_viv_uniform int width;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits 
uniConvert1stUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ -_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ -_viv_uniform float input_scale;\n\ -_viv_uniform int inputZP;\n\ -_viv_uniform int sumInZp;\n\ -_viv_uniform int tmpZp1;\n\ -_viv_uniform int tmpZp2;\n\ -_viv_uniform float e2InScale;\n\ -_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ -_viv_uniform VXC_512Bits UniPackFP16even_2x8;\n\ -\n\ -__kernel void vxcLayerNormU8toFp16(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_array_t output,\n\ - float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ - vxc_uchar16 src0;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSum = 0, tmpSqr = 0;\n\ - vxc_int4 tmpSum1;\n\ - vxc_int4 tmpSqr1;\n\ -\n\ - for(coord.x = 0; coord.x < width; coord.x += 16)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ - tmpSum += (tmpSum1.x);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ - tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);\n\ - }\n\ - sum = (tmpSum + sumInZp) * input_scale;\n\ - sqr = (tmpSqr + tmpZp2) * e2InScale;\n\ -\n\ - float mean, vari;\n\ - mean = sum * dimRatio;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ - int4 coord_bias = (int4)(0, 0, 0, 0);\n\ - vxc_half8 scale_h;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - vxc_short8 src1, outval;\n\ - short zp = inputZP;\n\ - half4 tmpVal0, tmpVal1;\n\ - vxc_half8 dst;\n\ -\n\ - for(coord.x = 0; coord.x < width; coord.x += 16)\n\ - {\n\ - coord_bias.x = coord.x;\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ -\n\ - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert4thUint8SubZpToFp32_4x4);\n\ - tmpData0 *= input_scale;\n\ - tmpData1 *= input_scale;\n\ - tmpData2 *= input_scale;\n\ - tmpData3 *= input_scale;\n\ -\n\ - vxc_float4 norm;\n\ - tmpData0 -= mean;\n\ - norm = scale_f0 * 
vari * tmpData0 + bias_f0;\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - coord_bias.x += 4;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ -\n\ - tmpData1 -= mean;\n\ - norm = scale_f1 * vari * tmpData1 + bias_f1;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - UniPackFP16even_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - int2 coord_out = (int2)(coord.x, coord.y);\n\ - VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - tmpData2 -= mean;\n\ - norm = scale_f0 * vari * tmpData2 + bias_f0;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ -\n\ - tmpData3 -= mean;\n\ - norm = scale_f1 * vari * tmpData3 + bias_f1;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - UniPackFP16even_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - coord_out.x += 8;\n\ - VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -"; /* end of vsi_nn_kernel_layernormalize_U8_vx*/ - -static const char vsi_nn_kernel_resize_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -//--------------------------resize-------------------------\n\ -_viv_uniform VXC_512Bits uniPackEvenData_2x8;\n\ -__kernel void resize_16bits_downsample_quarter\n\ - (\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output\n\ - )\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ - vxc_short8 src0, src1;\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(8, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord = coord >> 1;\n\ - VXC_DP2x8(src0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardInf, 0), uniPackEvenData_2x8);\n\ - VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void resize_8bits_downsample_quarter\n\ - (\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output\n\ - )\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ - vxc_char16 src0;\n\ - vxc_char8 dst;\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord = coord >> 1;\n\ - dst = src0.s02468ace;\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -"; /* end of vsi_nn_kernel_resize_vx*/ - static const char vsi_nn_kernel_roi_align_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ __kernel void vxcRoi_align(\n\ @@ -35501,193 +36343,6 @@ __kernel void vxcRoi_align(\n\ }\n\ "; /* end of vsi_nn_kernel_roi_align_vx*/ -static const char vsi_nn_kernel_scale_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -//--------------------------scale-------------------------\n\ -_viv_uniform VXC_512Bits uniExtractHalf8_2x8;\n\ -_viv_uniform VXC_512Bits uniFp16MulFp16ToFp32_Lo_4x4;\n\ -_viv_uniform VXC_512Bits uniFp16MulFp16ToFp32_Hi_4x4;\n\ -__kernel void scale_fp16\n\ - (\n\ - __read_only image2d_array_t input,\n\ - __read_only image2d_array_t weights,\n\ - __read_only image2d_array_t 
biases,\n\ - __write_only image2d_array_t output\n\ - )\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ - vxc_short8 vec0, vec1;\n\ - vxc_half8 src0;\n\ - vxc_half8 w0;\n\ - vxc_float4 b0, b1;\n\ - vxc_float4 dst0, dst1;\n\ - VXC_ReadImage(vec0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, src0, vec0, 16);\n\ - VXC_ReadImage(vec1, weights, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, w0, vec1, 16);\n\ -\n\ - coord.z = coord.x + 4;\n\ -\n\ - b0 = read_imagef(biases, coord.xwww);\n\ - b1 = read_imagef(biases, coord.zwww);\n\ -\n\ - VXC_DP4x4(dst0, src0, w0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniFp16MulFp16ToFp32_Lo_4x4);\n\ - VXC_DP4x4(dst1, src0, w0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniFp16MulFp16ToFp32_Hi_4x4);\n\ - dst0 += b0;\n\ - dst1 += b1;\n\ -\n\ - half4 t0, t1;\n\ -\n\ - _viv_asm(CONV, t0, dst0);\n\ - _viv_asm(CONV, t1, dst1);\n\ -\n\ - VXC_DP2x8(w0, t0, t1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8);\n\ - _viv_asm(COPY, vec0, w0, 16);\n\ -\n\ - VXC_WriteImage(output, coord.xy, vec0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -"; /* end of vsi_nn_kernel_scale_vx*/ - -static const char vsi_nn_kernel_shufflechannel_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -/******************shuffle channel float16/int16********************/\n\ -_viv_uniform int group_column;\n\ -_viv_uniform float rgroup_column;\n\ -\n\ -__kernel void shuffleChannelVXC(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - int group_number,\n\ - int axis)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ - vxc_short8 src0, src1, src2, src3;\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 1),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src2, input, coord, VXC_5BITOFFSET_XY(0, 2),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src3, input, coord, VXC_5BITOFFSET_XY(0, 3),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - int coordz = coord.z;\n\ - int index_col = coordz * rgroup_column;\n\ - int index_row = coordz - index_col * group_column;\n\ - coord.z = index_row * group_number + index_col;\n\ - VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord.y ++;\n\ - VXC_WriteImage2DArray(output, coord, src1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord.y ++;\n\ - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord.y ++;\n\ - VXC_WriteImage2DArray(output, coord, src3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -/*****************shuffle channel int8/uint8****************************/\n\ -\n\ -__kernel void shuffleChannel8BitsVXC(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - int group_number,\n\ - int axis)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ - vxc_char16 src0, src1, src2, src3;\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 1),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - 
VXC_ReadImage2DArray(src2, input, coord, VXC_5BITOFFSET_XY(0, 2),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src3, input, coord, VXC_5BITOFFSET_XY(0, 3),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - int coordz = coord.z;\n\ - int index_col = coordz * rgroup_column;\n\ - int index_row = coordz - index_col * group_column;\n\ - coord.z = index_row * group_number + index_col;\n\ - VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ - coord.y ++;\n\ - VXC_WriteImage2DArray(output, coord, src1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ - coord.y ++;\n\ - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ - coord.y ++;\n\ - VXC_WriteImage2DArray(output, coord, src3, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -"; /* end of vsi_nn_kernel_shufflechannel_vx*/ - -static const char vsi_nn_kernel_shufflechannel_axis1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -/******************shuffle channel float16/int16********************/\n\ -_viv_uniform int group_column;\n\ -_viv_uniform float rgroup_column;\n\ -\n\ -__kernel void shuffleChannel16Bits_Axis1(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - int group_number,\n\ - int axis)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ - int4 coord_out = coord;\n\ - vxc_short8 src0, src1, src2, src3;\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.x += 8;\n\ - VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.x += 8;\n\ - VXC_ReadImage2DArray(src2, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.x += 8;\n\ - VXC_ReadImage2DArray(src3, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - int coordy = coord.y;\n\ - int index_col = coordy * rgroup_column;\n\ - int index_row = coordy - index_col * group_column;\n\ - coord_out.y = index_row * group_number + index_col;\n\ - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.x += 8;\n\ - VXC_WriteImage2DArray(output, coord_out, src1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.x += 8;\n\ - VXC_WriteImage2DArray(output, coord_out, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.x += 8;\n\ - VXC_WriteImage2DArray(output, coord_out, src3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -/*****************shuffle channel int8/uint8****************************/\n\ -\n\ -__kernel void shuffleChannel8Bits_Axis1(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - int group_number,\n\ - int axis)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ - int4 coord_out = coord;\n\ - vxc_char16 src0, src1;\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord.x += 16;\n\ - VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - int coordy = coord.y;\n\ - int index_col = coordy * rgroup_column;\n\ - int index_row = coordy - index_col * group_column;\n\ - coord_out.y = index_row * group_number + index_col;\n\ - VXC_WriteImage2DArray(output, coord_out, 
src0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.x += 16;\n\ - VXC_WriteImage2DArray(output, coord_out, src1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -"; /* end of vsi_nn_kernel_shufflechannel_axis1_vx*/ - static const char vsi_nn_kernel_signalframe_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int input_width;\n\ @@ -35968,49 +36623,6 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void vxcSignalFrame_ten #endif\n\ "; /* end of vsi_nn_kernel_signalframe_vx*/ -static const char vsi_nn_kernel_space2depth_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniExtractEvenFp16Stride2_4x4;\n\ -_viv_uniform VXC_512Bits uniExtractOddFp16Stride2_4x4;\n\ -_viv_uniform int input_depth;\n\ -\n\ -__kernel void vxcReorg2_fp16_fp16_sx2_sy1\n\ - (\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - int stridex,\n\ - int stridey\n\ - )\n\ -{\n\ - int gidx = get_global_id(0);\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ -\n\ - int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ - int4 coord_out = (int4)(gidx >> 1, gidy, 0, 0);\n\ - int out_d0, out_d1;\n\ - vxc_short8 imageData;\n\ - vxc_short8 imgVal0, imgVal1;\n\ - //int tmpw = gidz / input_depth; \\n\\\n\ - //int tmpz = gidz % input_depth; \\n\\\n\ -\n\ - VXC_ReadImage2DArray(imageData, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP4x4(imgVal0, imageData, imageData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ - uniExtractEvenFp16Stride2_4x4);\n\ - VXC_DP4x4(imgVal1, imageData, imageData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ - uniExtractOddFp16Stride2_4x4);\n\ -\n\ - out_d0 = gidz * 2 * 1;\n\ - out_d1 = out_d0 + 1;\n\ -\n\ - coord_out.z = out_d0;\n\ - VXC_WriteImage2DArray(output, coord_out, imgVal0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - coord_out.z = out_d1;\n\ - VXC_WriteImage2DArray(output, coord_out, imgVal1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -"; /* end of vsi_nn_kernel_space2depth_vx*/ - static const char vsi_nn_kernel_tensorstackconcat_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ /*******************tensorstackconcat 16BITs********************/\n\ @@ -39894,6 +40506,151 @@ __kernel __attribute__((reqd_work_group_size(1, 16, 1))) void l2normalizescale_a }\n\ "; /* end of l2normalizescale_axis1_cl*/ +static const char layer_normalization_cl[] = "\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layer_norm_F32toF32(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __write_only image2d_array_t output,\n\ + float eps,\n\ + float input_zp,\n\ + float input_scale,\n\ + float output_zp,\n\ + float output_scale,\n\ + float e2InScale,\n\ + float scale_inOut,\n\ + float sumZpScale,\n\ + float zp2ScaleE2,\n\ + float sumZpScaleE2,\n\ + int width,\n\ + int height,\n\ + float dim_ratio\n\ + )\n\ +{\n\ + int lidx = get_local_id(0);\n\ + int4 coord = (int4)(lidx, get_global_id(1), get_global_id(2), 0);\n\ +\n\ + float4 data, dst;\n\ + float2 sumSqr = (float2)(0);\n\ + float scale_vari, bias_val;\n\ + __local float2 local_sum[16];\n\ +\n\ + for(; coord.x < width;)\n\ + {\n\ + data = read_imagef(input, coord);\n\ + coord.x += 16;\n\ + sumSqr.x += data.x;\n\ + sumSqr.y += data.x * data.x;\n\ + }\n\ + local_sum[lidx] = sumSqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + if(lidx == 0)\n\ + {\n\ + for(int i = 1; i < 16; i++)\n\ + {\n\ + sumSqr += local_sum[i];\n\ + }\n\ + local_sum[0] = 
sumSqr;\n\ + }\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + sumSqr = local_sum[0] * dim_ratio;\n\ + sumSqr.s1 = sumSqr.s1 - sumSqr.s0 * sumSqr.s0 + eps;\n\ + sumSqr.s1 = rsqrt(sumSqr.s1);\n\ +\n\ + for(coord.x = lidx; coord.x < width;)\n\ + {\n\ + float4 gamma = read_imagef(scale, coord.xw);\n\ + float4 beta = read_imagef(bias, coord.xw);\n\ + data = read_imagef(input, coord);\n\ +\n\ + scale_vari = gamma.s0 * sumSqr.s1;\n\ + bias_val = (beta.s0 - scale_vari * sumSqr.s0);\n\ +\n\ + dst.x = data.x * scale_vari + bias_val;\n\ + write_imagef(output, coord, dst);\n\ + coord.x += 16;\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layer_norm_U8toU8(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __write_only image2d_array_t output,\n\ + float eps,\n\ + float input_zp,\n\ + float input_scale,\n\ + float output_zp,\n\ + float output_scale,\n\ + float e2InScale,\n\ + float scale_inOut,\n\ + float sumZpScale,\n\ + float zp2ScaleE2,\n\ + float sumZpScaleE2,\n\ + int width,\n\ + int height,\n\ + float dim_ratio\n\ + )\n\ +{\n\ + int lidx = get_local_id(0);\n\ + int4 coord = (int4)(lidx, get_global_id(1), get_global_id(2), 0);\n\ +\n\ + uint4 data, dst;\n\ + float2 sumSqr;\n\ + uint tmpSum = 0, tmpSqr = 0;\n\ + float scale_vari, bias_val;\n\ + __local uint local_sum[1];\n\ + __local uint local_sqr[1];\n\ +\n\ + if(lidx == 0)\n\ + {\n\ + local_sum[0] = 0;\n\ + local_sqr[0] = 0;\n\ + }\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + for(; coord.x < width;)\n\ + {\n\ + data = read_imageui(input, coord);\n\ + coord.x+=16;\n\ + tmpSum += data.x;\n\ + tmpSqr += data.x * data.x;\n\ + }\n\ + atom_add(local_sum, tmpSum);\n\ + atom_add(local_sqr, tmpSqr);\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + tmpSum = local_sum[0];\n\ + tmpSqr = local_sqr[0];\n\ + //sumSqr.x = ((float)tmpSum - width * input_zp) * input_scale;\n\ + //sumSqr.y = ((float)tmpSqr - 2 * input_zp * (float)tmpSum + width * input_zp * input_zp) * e2InScale;\n\ + sumSqr.x = (float)tmpSum * input_scale - sumZpScale;\n\ + sumSqr.y = (float)tmpSqr * e2InScale - zp2ScaleE2 * (float)tmpSum + sumZpScaleE2;\n\ +\n\ + sumSqr *= dim_ratio;\n\ + sumSqr.s1 = sumSqr.s1 - sumSqr.s0 * sumSqr.s0 + eps;\n\ + sumSqr.s1 = rsqrt(sumSqr.s1);\n\ +\n\ + for(coord.x = lidx; coord.x < width;)\n\ + {\n\ + float4 gamma = read_imagef(scale, coord.xw);\n\ + float4 beta = read_imagef(bias, coord.xw);\n\ + data = read_imageui(input, coord);\n\ +\n\ + scale_vari = gamma.s0 * sumSqr.s1;\n\ + float alpha = scale_inOut * scale_vari;\n\ + bias_val = (beta.s0 - scale_vari * sumSqr.s0) * output_scale + output_zp;\n\ +\n\ + float tmpVal = data.x - input_zp;\n\ +\n\ + float4 norm;\n\ + norm.x = tmpVal * alpha + bias_val;\n\ + dst = convert_uint4_rte(norm);\n\ + write_imageui(output, coord, dst);\n\ + coord.x+=16;\n\ + }\n\ +}\n\ +"; /* end of layer_normalization_cl*/ + static const char log_softmax_axis0_cl[] = "#define rlogE (0.693147182f)\n\ float LOG(float x)\n\ {\n\ @@ -43435,32 +44192,30 @@ static const char matrixmul_cl[] = "__kernel void gemm_F32F32toF32_2D(\n\ int K,\n\ int N,\n\ int ac2zero,\n\ - int bc2zero\n\ + int bc2zero,\n\ + float scale_a,\n\ + float zp_a,\n\ + float scale_b,\n\ + float zp_b,\n\ + float scale_out,\n\ + float zp_out\n\ )\n\ {\n\ - int gidx = get_global_id(0);\n\ - int gidy = get_global_id(1);\n\ -\n\ - int2 coord_a = (int2)(0, gidy);\n\ - int2 coord_b = (int2)(gidx, 0);\n\ -\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ float4 sum = 
(float4)(0);\n\ \n\ - for(; coord_a.x < K;)\n\ + for(; coord.z < K;)\n\ {\n\ float4 tempA0;\n\ float4 tempB0;\n\ \n\ - tempA0 = read_imagef(inputA, coord_a);\n\ - tempB0 = read_imagef(inputB, coord_b);\n\ - coord_a.x++;\n\ - coord_b.y++;\n\ + tempA0 = read_imagef(inputA, coord.zy);\n\ + tempB0 = read_imagef(inputB, coord.xz);\n\ + coord.z++;\n\ \n\ - sum += tempA0 * tempB0;\n\ + sum = sum + tempA0 * tempB0;\n\ }\n\ -\n\ - coord_b.y = gidy;\n\ - write_imagef(output, coord_b, sum);\n\ + write_imagef(output, coord.xy, sum);\n\ }\n\ \n\ __kernel void gemm_F32F32toF32_3D(\n\ @@ -43471,7 +44226,13 @@ __kernel void gemm_F32F32toF32_3D(\n\ int K,\n\ int N,\n\ int ac2zero,\n\ - int bc2zero\n\ + int bc2zero,\n\ + float scale_a,\n\ + float zp_a,\n\ + float scale_b,\n\ + float zp_b,\n\ + float scale_out,\n\ + float zp_out\n\ )\n\ {\n\ int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? 0 : get_global_id(2)), 0);\n\ @@ -43489,13 +44250,163 @@ __kernel void gemm_F32F32toF32_3D(\n\ coord_a.x++;\n\ coord_b.y++;\n\ \n\ - sum += tempA0 * tempB0;\n\ + sum = sum + tempA0 * tempB0;\n\ }\n\ \n\ coord_b.y = get_global_id(1);\n\ coord_b.z = get_global_id(2);\n\ write_imagef(output, coord_b, sum);\n\ }\n\ +\n\ +__kernel void gemm_transb_F32F32toF32_2D(\n\ + __read_only image2d_t inputA,\n\ + __read_only image2d_t inputB,\n\ + __write_only image2d_t output,\n\ + int M,\n\ + int K,\n\ + int N,\n\ + int ac2zero,\n\ + int bc2zero,\n\ + float scale_a,\n\ + float zp_a,\n\ + float scale_b,\n\ + float zp_b,\n\ + float scale_out,\n\ + float zp_out\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + float4 sum = (float4)(0);\n\ +\n\ + for(; coord.z < K;)\n\ + {\n\ + float4 tempA0;\n\ + float4 tempB0;\n\ +\n\ + tempA0 = read_imagef(inputA, coord.zy);\n\ + tempB0 = read_imagef(inputB, coord.zx);\n\ + coord.z++;\n\ +\n\ + sum = sum + tempA0 * tempB0;\n\ + }\n\ + write_imagef(output, coord.xy, sum);\n\ +}\n\ +\n\ +__kernel void gemm_transb_F32F32toF32_3D(\n\ + __read_only image2d_array_t inputA,\n\ + __read_only image2d_array_t inputB,\n\ + __write_only image2d_array_t output,\n\ + int M,\n\ + int K,\n\ + int N,\n\ + int ac2zero,\n\ + int bc2zero,\n\ + float scale_a,\n\ + float zp_a,\n\ + float scale_b,\n\ + float zp_b,\n\ + float scale_out,\n\ + float zp_out\n\ + )\n\ +{\n\ + int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? 0 : get_global_id(2)), 0);\n\ + int4 coord_b = (int4)(0, get_global_id(0), (bc2zero ? 
0 : get_global_id(2)), 0);\n\ +\n\ + float4 sum = (float4)(0);\n\ +\n\ + for(; coord_a.x < K;)\n\ + {\n\ + float4 tempA0;\n\ + float4 tempB0;\n\ +\n\ + tempA0 = read_imagef(inputA, coord_a);\n\ + tempB0 = read_imagef(inputB, coord_b);\n\ + coord_a.x++;\n\ + coord_b.x++;\n\ +\n\ + sum = sum + tempA0 * tempB0;\n\ + }\n\ +\n\ + coord_a.x = get_global_id(0);\n\ + coord_a.z = get_global_id(2);\n\ + write_imagef(output, coord_b, sum);\n\ +}\n\ +\n\ +__kernel void gemm_transb_F32I8toF32_2D(\n\ + __read_only image2d_t inputA,\n\ + __read_only image2d_t inputB,\n\ + __write_only image2d_t output,\n\ + int M,\n\ + int K,\n\ + int N,\n\ + int ac2zero,\n\ + int bc2zero,\n\ + float scale_a,\n\ + float zp_a,\n\ + float scale_b,\n\ + float zp_b,\n\ + float scale_out,\n\ + float zp_out\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + float4 sum = (float4)(0);\n\ + for(; coord.z < K;)\n\ + {\n\ + float4 tempA0;\n\ + float4 tempB0;\n\ +\n\ + tempA0 = read_imagef(inputA, coord.zy);\n\ + tempB0 = convert_float4(read_imagei(inputB, coord.zx));\n\ + coord.z++;\n\ + tempB0.x = (tempB0.x - zp_b) * scale_b;\n\ +\n\ + sum = sum + tempA0 * tempB0;\n\ + }\n\ +\n\ + write_imagef(output, coord.xy, sum);\n\ +}\n\ +\n\ +__kernel void gemm_transb_F32I8toF32_3D(\n\ + __read_only image2d_array_t inputA,\n\ + __read_only image2d_array_t inputB,\n\ + __write_only image2d_array_t output,\n\ + int M,\n\ + int K,\n\ + int N,\n\ + int ac2zero,\n\ + int bc2zero,\n\ + float scale_a,\n\ + float zp_a,\n\ + float scale_b,\n\ + float zp_b,\n\ + float scale_out,\n\ + float zp_out\n\ + )\n\ +{\n\ + int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? 0 : get_global_id(2)), 0);\n\ + int4 coord_b = (int4)(0, get_global_id(0), (bc2zero ? 0 : get_global_id(2)), 0);\n\ +\n\ + float4 sum = (float4)(0);\n\ +\n\ + for(; coord_a.x < K;)\n\ + {\n\ + float4 tempA0;\n\ + float4 tempB0;\n\ +\n\ + tempA0 = read_imagef(inputA, coord_a);\n\ + tempB0 = convert_float4(read_imagei(inputB, coord_b));\n\ + tempB0.x = (tempB0.x - zp_b) * scale_b;\n\ + coord_a.x++;\n\ + coord_b.x++;\n\ +\n\ + sum = sum + tempA0 * tempB0;\n\ + }\n\ +\n\ + coord_a.x = get_global_id(0);\n\ + coord_a.z = get_global_id(2);\n\ + write_imagef(output, coord_b, sum);\n\ +}\n\ "; /* end of matrixmul_cl*/ static const char matrixmul_transA_cl[] = "__kernel void gemm_transa_F32F32toF32_2D(\n\ @@ -43506,32 +44417,30 @@ static const char matrixmul_transA_cl[] = "__kernel void gemm_transa_F32F32toF32 int K,\n\ int N,\n\ int ac2zero,\n\ - int bc2zero\n\ + int bc2zero,\n\ + float scale_a,\n\ + float zp_a,\n\ + float scale_b,\n\ + float zp_b,\n\ + float scale_out,\n\ + float zp_out\n\ )\n\ {\n\ - int gidx = get_global_id(0);\n\ - int gidy = get_global_id(1);\n\ -\n\ - int2 coord_a = (int2)(gidy, 0);\n\ - int2 coord_b = (int2)(gidx, 0);\n\ -\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ float4 sum = (float4)(0);\n\ \n\ - for(; coord_a.y < K;)\n\ + for(; coord.z < K;)\n\ {\n\ float4 tempA0;\n\ float4 tempB0;\n\ \n\ - tempA0 = read_imagef(inputA, coord_a);\n\ - tempB0 = read_imagef(inputB, coord_b);\n\ - coord_a.y++;\n\ - coord_b.y++;\n\ + tempA0 = read_imagef(inputA, coord.yz);\n\ + tempB0 = read_imagef(inputB, coord.xz);\n\ + coord.z++;\n\ \n\ - sum += tempA0 * tempB0;\n\ + sum = sum + tempA0 * tempB0;\n\ }\n\ -\n\ - coord_b.y = gidy;\n\ - write_imagef(output, coord_b, sum);\n\ + write_imagef(output, coord.xy, sum);\n\ }\n\ \n\ __kernel void gemm_transa_F32F32toF32_3D(\n\ @@ -43542,7 +44451,13 @@ __kernel void 
gemm_transa_F32F32toF32_3D(\n\ int K,\n\ int N,\n\ int ac2zero,\n\ - int bc2zero\n\ + int bc2zero,\n\ + float scale_a,\n\ + float zp_a,\n\ + float scale_b,\n\ + float zp_b,\n\ + float scale_out,\n\ + float zp_out\n\ )\n\ {\n\ int gidx = get_global_id(0);\n\ @@ -43563,7 +44478,7 @@ __kernel void gemm_transa_F32F32toF32_3D(\n\ coord_a.y++;\n\ coord_b.y++;\n\ \n\ - sum += tempA0 * tempB0;\n\ + sum = sum + tempA0 * tempB0;\n\ }\n\ \n\ coord_b.y = gidy;\n\ @@ -47186,6 +48101,115 @@ __kernel void resize_nearest_U8toU8(\n\ }\n\ "; /* end of resize_nearest_cl*/ +static const char roi_align_cl[] = "inline float roi_align_1x1\n\ +(\n\ + __read_only image2d_array_t input,\n\ + float2 region_start,\n\ + float2 region_end,\n\ + float2 bin_size,\n\ + int2 grid_size,\n\ + float2 rcp_of_grid_size,\n\ + int pz\n\ +)\n\ +{\n\ + float sum = 0;\n\ +\n\ + for(int iy = 0; iy < grid_size.y; ++iy)\n\ + {\n\ + for(int ix = 0; ix < grid_size.x; ++ix)\n\ + {\n\ + float2 ixy = (float2)(ix + 0.5f, iy + 0.5f);\n\ + float2 pos = region_start + ixy * bin_size * rcp_of_grid_size;\n\ +\n\ + int2 xy_low = convert_int2(pos);\n\ + int2 xy_high = xy_low + 1;\n\ +\n\ + float ly = pos.y - xy_low.y;\n\ + float lx = pos.x - xy_low.x;\n\ + float hy = 1.0f - ly;\n\ + float hx = 1.0f - lx;\n\ +\n\ + float w1 = hy * hx;\n\ + float w2 = hy * lx;\n\ + float w3 = ly * hx;\n\ + float w4 = ly * lx;\n\ +\n\ + float data1 = read_imagef(input, (int4)(xy_low.x, xy_low.y, pz, 0)).x;\n\ + float data2 = read_imagef(input, (int4)(xy_high.x, xy_low.y, pz, 0)).x;\n\ + float data3 = read_imagef(input, (int4)(xy_low.x, xy_high.y, pz, 0)).x;\n\ + float data4 = read_imagef(input, (int4)(xy_high.x, xy_high.y, pz, 0)).x;\n\ +\n\ + sum = sum + w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;\n\ + }\n\ + }\n\ +\n\ + return (float)(sum * rcp_of_grid_size.x * rcp_of_grid_size.y);\n\ +}\n\ +\n\ +\n\ +#define EPS_GRID 0.00001f\n\ +__kernel void roi_align_F32toF32\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t rois,\n\ + __read_only image2d_t n_rois,\n\ + __write_only image2d_array_t output,\n\ + float spatial_x_scale,\n\ + float spatial_y_scale,\n\ + float in_width,\n\ + float in_height,\n\ + float rcp_of_out_width,\n\ + float rcp_of_out_height,\n\ + float sampling_x_ratio,\n\ + float sampling_y_ratio,\n\ + int depth\n\ +)\n\ +{\n\ + int px = get_global_id(0);\n\ + int py = get_global_id(1);\n\ + int pw = get_global_id(2);\n\ +\n\ + int roi_batch = read_imagei(n_rois, (int2)(pw, 0)).x;\n\ + float4 roi_x = read_imagef(rois, (int2)(0, pw));\n\ + float4 roi_y = read_imagef(rois, (int2)(1, pw));\n\ + float4 roi_z = read_imagef(rois, (int2)(2, pw));\n\ + float4 roi_w = read_imagef(rois, (int2)(3, pw));\n\ + float4 roi = (float4)(roi_x.x, roi_y.x, roi_z.x, roi_w.x);\n\ +\n\ + float4 roi_anchor = roi * (float4)(spatial_x_scale, spatial_y_scale, spatial_x_scale, spatial_y_scale);\n\ + float2 roi_dims = fmax(roi_anchor.zw - roi_anchor.xy, 1.0f);\n\ +\n\ + float2 spatial_indx = (float2)(px, py);\n\ + float2 pooled_dims = (float2)(rcp_of_out_width, rcp_of_out_height);\n\ + float2 max_spatial_dims = (float2)(in_width, in_height);\n\ +\n\ + float2 bin_size = roi_dims * pooled_dims;\n\ + float2 region_start = spatial_indx * bin_size + roi_anchor.xy;\n\ + float2 region_end = region_start + bin_size;\n\ +\n\ + float2 roi_bin_grid = (float2)(sampling_x_ratio, sampling_y_ratio);\n\ +\n\ + roi_bin_grid = roi_bin_grid == 0 ? 
ceil(bin_size - EPS_GRID) : roi_bin_grid;\n\ +\n\ + int kz = roi_batch * depth;\n\ + float2 rcp_of_grid_size = 1.0f / roi_bin_grid;\n\ + int2 grid_size_xy = convert_int2(roi_bin_grid);\n\ + float4 interp;\n\ + int kz1 = pw * depth;\n\ + for (int pz = 0; pz < depth; pz ++, kz ++, kz1 ++)\n\ + {\n\ + interp.x = roi_align_1x1( input,\n\ + region_start,\n\ + region_end,\n\ + bin_size,\n\ + grid_size_xy,\n\ + rcp_of_grid_size,\n\ + kz);\n\ +\n\ + write_imagef(output, (int4)(px, py, kz1, 0), interp);\n\ + }\n\ +}"; /* end of roi_align_cl*/ + static const char scatter_nd_cl[] = "__kernel void scatter_nd_U32toU32_1D(\n\ __read_only image2d_t input0,\n\ __read_only image2d_t input1,\n\ @@ -47548,6 +48572,98 @@ __kernel void select_I8_F32_F32toF32_2D(\n\ }\n\ "; /* end of select_cl*/ +static const char space2depth_internal_cl[] = "\n\ +__kernel void space2depth_internal_F32toF32 (\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int block_size_x, int block_size_y,\n\ + float scaleInOut, float zpInOut)\n\ +{\n\ + int x = get_global_id(0);\n\ + int y = get_global_id(1);\n\ + int z = get_global_id(2);\n\ + int inDepth = get_image_array_size(input);\n\ +\n\ + int4 coord = (int4)(x, y, z, 0);\n\ + float4 data = {0.0};\n\ + data = read_imagef(input, coord);\n\ +\n\ + ushort blockSize_x = convert_ushort(block_size_x);\n\ + ushort blockSize_y = convert_ushort(block_size_y);\n\ + int4 coord_out = (int4)(convert_ushort(x)/blockSize_x, convert_ushort(y)/blockSize_y, 0, 0);\n\ + coord_out.z = ((x - coord_out.x * block_size_x) + (y - coord_out.y * block_size_y) * block_size_x) * inDepth\n\ + + z;\n\ + write_imagef(output, coord_out, data);\n\ +}\n\ +\n\ +__kernel void space2depth_internal_F32toF32_X2Y1 (\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int block_size_x, int block_size_y,\n\ + float scaleInOut, float zpInOut)\n\ +{\n\ + int x = get_global_id(0);\n\ + int y = get_global_id(1);\n\ + int z = get_global_id(2);\n\ + int inDepth = get_image_array_size(input);\n\ +\n\ + int4 coord = (int4)(x, y, z, 0);\n\ + float4 data = {0.0};\n\ + data = read_imagef(input, coord);\n\ +\n\ + int4 coord_out = (int4)(x >> 1, y, 0, 0);\n\ + coord_out.z = (x & 1) * inDepth + z;\n\ + write_imagef(output, coord_out, data);\n\ +}\n\ +\n\ +__kernel void space2depth_internal_U8toU8 (\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int block_size_x, int block_size_y,\n\ + float scaleInOut, float zpInOut)\n\ +{\n\ + int x = get_global_id(0);\n\ + int y = get_global_id(1);\n\ + int z = get_global_id(2);\n\ + int inDepth = get_image_array_size(input);\n\ +\n\ + int4 coord = (int4)(x, y, z, 0);\n\ + uint4 data = {0};\n\ + data = read_imageui(input, coord);\n\ +\n\ + ushort blockSize_x = convert_ushort(block_size_x);\n\ + ushort blockSize_y = convert_ushort(block_size_y);\n\ + int4 coord_out = (int4)(convert_ushort(x)/blockSize_x, convert_ushort(y)/blockSize_y, 0, 0);\n\ + coord_out.z = ((x - coord_out.x * block_size_x) + (y - coord_out.y * block_size_y) * block_size_x) * inDepth\n\ + + z;\n\ +\n\ + data.x = convert_uint(data.x * scaleInOut + zpInOut);\n\ + write_imageui(output, coord_out, data);\n\ +}\n\ +\n\ +__kernel void space2depth_internal_U8toU8_X2Y1 (\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int block_size_x, int block_size_y,\n\ + float scaleInOut, float zpInOut)\n\ +{\n\ + int x = get_global_id(0);\n\ + int y = get_global_id(1);\n\ + int z = get_global_id(2);\n\ + int inDepth = get_image_array_size(input);\n\ +\n\ + int4 coord = (int4)(x, y, z, 0);\n\ + uint4 data 
= {0};\n\ + data = read_imageui(input, coord);\n\ +\n\ + int4 coord_out = (int4)(x >> 1, y, 0, 0);\n\ + coord_out.z = (x & 1) * inDepth + z;\n\ +\n\ + data.x = convert_uint(data.x * scaleInOut + zpInOut);\n\ + write_imageui(output, coord_out, data);\n\ +}\n\ +"; /* end of space2depth_internal_cl*/ + static const char swish_cl[] = "float sigmoid_(float x, float logE)\n\ {\n\ x *= -logE;\n\ @@ -48019,6 +49135,13 @@ static const source_map_t evis_resource[] = {"instance_normalization_u8_vx", instance_normalization_u8_vx}, {"l2normalizescale_axis0_vx", l2normalizescale_axis0_vx}, {"l2normalizescale_axis1_vx", l2normalizescale_axis1_vx}, + {"layer_normalization_vx", layer_normalization_vx}, + {"layer_normalization_2d_vx", layer_normalization_2d_vx}, + {"layer_normalization_i16_vx", layer_normalization_i16_vx}, + {"layer_normalization_u8_f16_vx", layer_normalization_u8_f16_vx}, + {"layer_normalization_wh_f16_vx", layer_normalization_wh_f16_vx}, + {"layer_normalization_wh_i16_vx", layer_normalization_wh_i16_vx}, + {"layer_normalization_wh_u8_vx", layer_normalization_wh_u8_vx}, {"log_softmax_axis0_vx", log_softmax_axis0_vx}, {"log_softmax_axis0_BF16_vx", log_softmax_axis0_BF16_vx}, {"log_softmax_axis1_vx", log_softmax_axis1_vx}, @@ -48082,27 +49205,21 @@ static const source_map_t evis_resource[] = {"pow_i8_vx", pow_i8_vx}, {"pow_u8_vx", pow_u8_vx}, {"pre_process_bgra_vx", pre_process_bgra_vx}, - {"pre_process_bgra_trans_vx", pre_process_bgra_trans_vx}, {"pre_process_gray_vx", pre_process_gray_vx}, {"pre_process_gray_copy_vx", pre_process_gray_copy_vx}, {"pre_process_nv12_scale_vx", pre_process_nv12_scale_vx}, {"pre_process_nv12_scale_8bits_vx", pre_process_nv12_scale_8bits_vx}, {"pre_process_nv12_scale_mix_vx", pre_process_nv12_scale_mix_vx}, - {"pre_process_nv12_trans_u8_vx", pre_process_nv12_trans_u8_vx}, {"pre_process_rgb_vx", pre_process_rgb_vx}, {"pre_process_rgb_copy_vx", pre_process_rgb_copy_vx}, - {"pre_process_rgb_copy_trans_vx", pre_process_rgb_copy_trans_vx}, - {"pre_process_rgb_trans_vx", pre_process_rgb_trans_vx}, {"pre_process_yuv420_copy_u8_vx", pre_process_yuv420_copy_u8_vx}, {"pre_process_yuv420_scale_fp16_vx", pre_process_yuv420_scale_fp16_vx}, {"pre_process_yuv420_scale_i16_vx", pre_process_yuv420_scale_i16_vx}, {"pre_process_yuv420_scale_i8_vx", pre_process_yuv420_scale_i8_vx}, {"pre_process_yuv420_scale_u8_vx", pre_process_yuv420_scale_u8_vx}, - {"pre_process_yuv420_trans_u8_vx", pre_process_yuv420_trans_u8_vx}, {"pre_process_yuv444_copy_u8_vx", pre_process_yuv444_copy_u8_vx}, {"pre_process_yuv444_scale_vx", pre_process_yuv444_scale_vx}, {"pre_process_yuv444_scale_fp16_vx", pre_process_yuv444_scale_fp16_vx}, - {"pre_process_yuv444_trans_u8_vx", pre_process_yuv444_trans_u8_vx}, {"prelu_vx", prelu_vx}, {"prelu_BF16_vx", prelu_BF16_vx}, {"random_multinomial_vx", random_multinomial_vx}, @@ -48144,6 +49261,7 @@ static const source_map_t evis_resource[] = {"scatter_nd_vx", scatter_nd_vx}, {"scatter_nd_big_vx", scatter_nd_big_vx}, {"select_vx", select_vx}, + {"space2depth_internal_vx", space2depth_internal_vx}, {"swish_vx", swish_vx}, {"tile_vx", tile_vx}, {"tile_mix_vx", tile_mix_vx}, @@ -48151,12 +49269,12 @@ static const source_map_t evis_resource[] = {"upsample_I16_vx", upsample_I16_vx}, {"upsample_I8_vx", upsample_I8_vx}, {"upsample_U8_vx", upsample_U8_vx}, + {"upsamplescale_vx", upsamplescale_vx}, + {"upsamplescale_k2_vx", upsamplescale_k2_vx}, {"vsi_nn_kernel_axis_aligned_bbox_transform_vx", vsi_nn_kernel_axis_aligned_bbox_transform_vx}, {"vsi_nn_kernel_box_with_nms_limit_vx", 
vsi_nn_kernel_box_with_nms_limit_vx}, - {"vsi_nn_kernel_crop_vx", vsi_nn_kernel_crop_vx}, {"vsi_nn_kernel_detection_postprocess_vx", vsi_nn_kernel_detection_postprocess_vx}, {"vsi_nn_kernel_extra_ending_vx", vsi_nn_kernel_extra_ending_vx}, - {"vsi_nn_kernel_fullconnect2_vx", vsi_nn_kernel_fullconnect2_vx}, {"vsi_nn_kernel_generate_proposals_vx", vsi_nn_kernel_generate_proposals_vx}, {"vsi_nn_kernel_header_vx", vsi_nn_kernel_header_vx}, {"vsi_nn_kernel_heatmap_max_keypoint_vx", vsi_nn_kernel_heatmap_max_keypoint_vx}, @@ -48165,15 +49283,8 @@ static const source_map_t evis_resource[] = {"vsi_nn_kernel_imageprocess_3_vx", vsi_nn_kernel_imageprocess_3_vx}, {"vsi_nn_kernel_imageprocess_4_vx", vsi_nn_kernel_imageprocess_4_vx}, {"vsi_nn_kernel_imageprocess_5_vx", vsi_nn_kernel_imageprocess_5_vx}, - {"vsi_nn_kernel_layernormalize_vx", vsi_nn_kernel_layernormalize_vx}, - {"vsi_nn_kernel_layernormalize_U8_vx", vsi_nn_kernel_layernormalize_U8_vx}, - {"vsi_nn_kernel_resize_vx", vsi_nn_kernel_resize_vx}, {"vsi_nn_kernel_roi_align_vx", vsi_nn_kernel_roi_align_vx}, - {"vsi_nn_kernel_scale_vx", vsi_nn_kernel_scale_vx}, - {"vsi_nn_kernel_shufflechannel_vx", vsi_nn_kernel_shufflechannel_vx}, - {"vsi_nn_kernel_shufflechannel_axis1_vx", vsi_nn_kernel_shufflechannel_axis1_vx}, {"vsi_nn_kernel_signalframe_vx", vsi_nn_kernel_signalframe_vx}, - {"vsi_nn_kernel_space2depth_vx", vsi_nn_kernel_space2depth_vx}, {"vsi_nn_kernel_tensorstackconcat_vx", vsi_nn_kernel_tensorstackconcat_vx}, {"vsi_nn_kernel_topk_vx", vsi_nn_kernel_topk_vx}, {"vsi_nn_kernel_transform_gemm_vx", vsi_nn_kernel_transform_gemm_vx}, @@ -48210,6 +49321,7 @@ static const source_map_t cl_resource[] = {"instance_normalization_u8_cl", instance_normalization_u8_cl}, {"l2normalizescale_axis0_cl", l2normalizescale_axis0_cl}, {"l2normalizescale_axis1_cl", l2normalizescale_axis1_cl}, + {"layer_normalization_cl", layer_normalization_cl}, {"log_softmax_axis0_cl", log_softmax_axis0_cl}, {"log_softmax_axis1_cl", log_softmax_axis1_cl}, {"log_softmax_axis2_cl", log_softmax_axis2_cl}, @@ -48271,8 +49383,10 @@ static const source_map_t cl_resource[] = {"resize_1d_nearest_cl", resize_1d_nearest_cl}, {"resize_bilinear_cl", resize_bilinear_cl}, {"resize_nearest_cl", resize_nearest_cl}, + {"roi_align_cl", roi_align_cl}, {"scatter_nd_cl", scatter_nd_cl}, {"select_cl", select_cl}, + {"space2depth_internal_cl", space2depth_internal_cl}, {"swish_cl", swish_cl}, {"tile_cl", tile_cl}, {"upsample_cl", upsample_cl}, diff --git a/src/tim/vx/internal/src/makefile.linux b/src/tim/vx/internal/src/makefile.linux index 86691ef..799a920 100644 --- a/src/tim/vx/internal/src/makefile.linux +++ b/src/tim/vx/internal/src/makefile.linux @@ -10,8 +10,11 @@ CFLAGS += -fvisibility=hidden -D'OVXLIB_API=__attribute__((visibility("default") ################################################################################ # Supply necessary libraries. 
- -LIBS += -L$(VIVANTE_SDK_LIB) -l OpenVX -l OpenVXU -l CLC -l VSC +ifeq ($(USE_VXC_BINARY)$(USE_VSC_LITE),11) +LIBS += -L$(VIVANTE_SDK_LIB) -l OpenVX -l OpenVXU -l CLC -l VSC_Lite -lGAL +else +LIBS += -L$(VIVANTE_SDK_LIB) -l OpenVX -l OpenVXU -l CLC -l VSC -lGAL +endif LIBS += -lm -ldl ############################################################################# diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c b/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c index 8f115d9..0a260d2 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c @@ -219,7 +219,10 @@ static vsi_bool op_check IO_TYPE(D_F32, D_I32) IO_TYPE(D_F16, D_I32) IO_TYPE(D_I32, D_I32) - IO_TYPE(D_U8|Q_ASYM, D_I32) + IO_TYPE(D_I8|Q_DFP, D_I32) + IO_TYPE(D_I8, D_I32) + IO_TYPE(D_U8|Q_ASYM, D_I32) + IO_TYPE(D_U8, D_I32) END_IO_TYPE_DECL(ARGMIN) if(!VALIDATE_OP_IO_TYPES(ARGMIN, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c b/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c index 04757b5..cb1cda3 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c @@ -44,190 +44,6 @@ #define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) #define _PARAM_NUM (_ARG_NUM + _IO_NUM) -#define USE_OVX_API TRUE - -#if (USE_OVX_API == FALSE) -extern vx_kernel_description_t * vx_kernel_CROP_list[]; - -static void _set_inputs_outputs - ( - vx_reference * params, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - uint32_t i; - uint32_t cnt; - - /* Set inputs */ - cnt = 0; - for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)inputs[i]->t; - } - - /* Set outputs */ - for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)outputs[i]->t; - } -} /* _set_inputs_outputs() */ - -static vsi_status _create_params - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num - ) -{ - vsi_status status; - vx_context ctx; - vsi_nn_crop_param * p; - if( 0 == num ) - { - return VSI_SUCCESS; - } - memset( params, 0, sizeof( vx_reference * ) * num ); - p = &(node->nn_param.crop); - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ -#define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ - status = vxGetStatus( params[i] ); \ - if( VSI_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - _SET_PARAM( 0, VX_TYPE_INT32, offset[0] ); - _SET_PARAM( 1, VX_TYPE_INT32, offset[1] ); - _SET_PARAM( 2, VX_TYPE_INT32, offset[2] ); - -#undef _SET_PARAM -set_param_error: - - return status; -} /* _create_params */ - -static void _release_params - ( - vx_reference * params, - uint32_t num - ) -{ - uint32_t i; - vx_scalar scalar; - for( i = 0; i < num; i ++ ) - { - scalar = (vx_scalar)params[i]; - vxReleaseScalar( &scalar ); - } -} /* _release_params() */ - -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. 
*/ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_status vx_op_pre_init - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs, - vsi_nn_kernel_info_t * kernel_info - ) -{ - vsi_nn_type_e dataFormat = inputs[0]->attr.dtype.vx_type; - vsi_nn_type_e dstFormat = outputs[0]->attr.dtype.vx_type; - - if (dataFormat == VSI_NN_TYPE_FLOAT16 - || (dataFormat == VSI_NN_TYPE_INT16 && dstFormat == VSI_NN_TYPE_INT16)) - { - kernel_info->kernel_index = 1; - } - else if(dataFormat == VSI_NN_TYPE_INT16 && dstFormat == VSI_NN_TYPE_FLOAT16) - { - kernel_info->kernel_index = 3; - } - else - { - kernel_info->kernel_index = 2; - } - - return VSI_SUCCESS; -} - -static vsi_status vx_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_border_t border; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - border.mode = VX_BORDER_REPLICATE; - border.constant_value.U32 = 0; - status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border)); - - return status; -} - -static vsi_nn_op_compute_t op_compute_list[] = -{ - cpu_op_compute, - vx_op_compute, - NULL -}; -#endif - static vsi_status op_compute ( vsi_nn_node_t * self, @@ -236,7 +52,6 @@ static vsi_status op_compute ) { vsi_status status = VSI_FAILURE; -#if (USE_OVX_API == TRUE) vx_nn_stride_slice_params_t param; vsi_nn_tensor_t *begin_dims_tensor = NULL; vsi_nn_tensor_t *end_dims_tensor = NULL; @@ -317,36 +132,6 @@ static vsi_status op_compute { status = VSI_SUCCESS; } -#else - vsi_nn_kernel_info_t kernel_info; - - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - status = VSI_FAILURE; - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "vsi_nn_kernel_crop"; - kernel_info.type = vsi_nn_GetVXKernelTypeForShader(); - kernel_info.kernel = vx_kernel_CROP_list; - kernel_info.init_index = 1; - - if (vsi_nn_is_do_vx_op_pre_init(kernel_info.type)) - { - vx_op_pre_init(self, inputs, outputs, &kernel_info); - } - - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) free(kernel_info.resource_name); - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - if (NULL != op_compute_list[kernel_info.init_index]) - { - status = op_compute_list[kernel_info.init_index](self, inputs, outputs); - } -#endif OnError: if (begin_dims_tensor) vsi_nn_ReleaseTensor(&begin_dims_tensor); if (end_dims_tensor) vsi_nn_ReleaseTensor(&end_dims_tensor); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c index ca6b3db..e86fe3d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c @@ -221,6 +221,9 @@ static vsi_bool op_check IO_TYPE(D_BF16, D_F32) IO_TYPE(D_I32, D_I32) IO_TYPE(D_I32, D_I16|Q_DFP) + IO_TYPE(D_I16, D_I16|Q_DFP) + IO_TYPE(D_I8, D_I8|Q_DFP) + IO_TYPE(D_U8, 
D_U8|Q_ASYM) END_IO_TYPE_DECL(DATACONVERT) if (!VALIDATE_OP_IO_TYPES(DATACONVERT, self, inputs, self->input.num, outputs, self->output.num)) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c index d86b715..4e33da4 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c @@ -196,6 +196,7 @@ static vsi_bool op_check IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c b/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c index 39dbfe9..b942f2c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c @@ -89,7 +89,11 @@ static vsi_bool op_check IO_TYPE(D_I32, D_F16, D_F16) IO_TYPE(D_I32, D_F32, D_F32) IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I32, D_U8|Q_ASYM, D_F32) IO_TYPE(D_I32, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I32, D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_I32, D_U8|Q_ASYM, D_I8) + IO_TYPE(D_F16, D_F16, D_F16) END_IO_TYPE_DECL(EMBEDDING_LOOKUP) if (!VALIDATE_OP_IO_TYPES(EMBEDDING_LOOKUP, self, inputs, self->input.num, outputs, self->output.num)) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect2.c b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect2.c index 7fca31a..d1af0cd 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect2.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect2.c @@ -42,215 +42,6 @@ #define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) #define _PARAM_NUM (_ARG_NUM + _IO_NUM) -#define USE_OVX_API TRUE - -#if (USE_OVX_API == FALSE) -extern vx_kernel_description_t * vx_kernel_FCL2_list[]; - -static void _set_inputs_outputs - ( - vx_reference * params, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - uint32_t i; - uint32_t cnt; - - /* Set inputs */ - cnt = 0; - for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)inputs[i]->t; - } - - /* Set outputs */ - for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)outputs[i]->t; - } -} /* _set_inputs_outputs() */ - -static vsi_status _create_params - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num - ) -{ - vsi_status status; - vx_context ctx; - vsi_nn_fcl_param * p; - if( 0 == num ) - { - return VSI_SUCCESS; - } - memset( params, 0, sizeof( vx_reference * ) * num ); - p = &(node->nn_param.fcl); - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ -#define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ - status = vxGetStatus( params[i] ); \ - if( VSI_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - _SET_PARAM( 0, VX_TYPE_INT32, axis ); - //_SET_PARAM( 1, VX_TYPE_FLOAT32, bias ); - //_SET_PARAM( 2, VX_TYPE_TENSOR, data_bias ); - //_SET_PARAM( 3, VX_TYPE_TENSOR, data_weight ); - //_SET_PARAM( 4, VX_TYPE_FLOAT32, regularize ); - _SET_PARAM( 1, VX_TYPE_INT32, weights ); -#undef _SET_PARAM -set_param_error: - - return status; -} /* _create_params */ - -static void _release_params - ( 
- vx_reference * params, - uint32_t num - ) -{ - uint32_t i; - vx_scalar scalar; - for( i = 0; i < num; i ++ ) - { - scalar = (vx_scalar)params[i]; - vxReleaseScalar( &scalar ); - } -} /* _release_params() */ - -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_status vx_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - uint32_t axis; - vsi_nn_fcl_param * p; - uint32_t i = 0; - uint32_t num_fc = 1, num_no_fc = 1; - uint32_t num_of_dims[3] = {0}; - uint32_t input_size[VSI_NN_MAX_DIM_NUM] = {0}; - uint32_t output_size[VSI_NN_MAX_DIM_NUM] = {0}; - uint32_t weights_size[VSI_NN_MAX_DIM_NUM] = {0}; - int32_t size[VSI_NN_MAX_DIM_NUM] = {0}; - uint32_t ofm = 0; - uint32_t dims = 0; - vx_tensor input = NULL; - vx_tensor output = NULL; - vx_tensor weight = NULL; - vx_tensor bias = NULL; - int32_t index = 0; - vx_border_t border; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - p = (vsi_nn_fcl_param *)&(self->nn_param.fcl); - axis = p->axis; - - memcpy(input_size, inputs[0]->attr.size, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM); - num_of_dims[0] = inputs[0]->attr.dim_num; - memcpy(output_size, outputs[0]->attr.size, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM); - num_of_dims[1] = outputs[0]->attr.dim_num; - memcpy(weights_size, inputs[1]->attr.size, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM); - num_of_dims[2] = inputs[1]->attr.dim_num; - - ofm = weights_size[num_of_dims[2] - 1]; - - for(i = 0; i <= (uint32_t)axis; ++i) - { - num_fc *= input_size[i]; - } - for(i = axis + 1; i < num_of_dims[0]; ++i) - { - num_no_fc *= input_size[i]; - } - - size[0] = num_fc; - size[1] = num_no_fc; - dims= 2; - input = vxReshapeTensor(inputs[0]->t, size, dims); - - size[0] = num_fc; - size[1] = ofm; - dims= 2; - weight = vxReshapeTensor(inputs[1]->t, size, dims); - - size[0] = ofm; - size[1] = 1; - dims= 2; - bias = vxReshapeTensor(inputs[2]->t, size, dims); - - size[0] = ofm; - size[1] = num_no_fc; - dims= 2; - output = vxReshapeTensor(outputs[0]->t, size, dims); - - status |= vxSetParameterByIndex(self->n, index++, (vx_reference)input); - status |= vxSetParameterByIndex(self->n, index++, (vx_reference)weight); - status |= vxSetParameterByIndex(self->n, index++, (vx_reference)bias); - status |= vxSetParameterByIndex(self->n, index++, (vx_reference)output); - - border.mode = VX_BORDER_CONSTANT; - border.constant_value.S16 = 0; - status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border)); - - if (input) vxReleaseTensor(&input); - if (weight) vxReleaseTensor(&weight); - if (bias) vxReleaseTensor(&bias); - if (output) vxReleaseTensor(&output); - - return status; -} - -static vsi_nn_op_compute_t op_compute_list[] = -{ - cpu_op_compute, - vx_op_compute, - NULL -}; -#endif - static vsi_status op_compute ( vsi_nn_node_t * self, @@ -259,7 +50,6 @@ static vsi_status op_compute ) { vsi_status status = VSI_FAILURE; -#if (USE_OVX_API == TRUE) uint32_t 
axis; vsi_nn_fcl_param * p; uint32_t i = 0; @@ -343,30 +133,7 @@ static vsi_status op_compute if (weight) vxReleaseTensor(&weight); if (bias) vxReleaseTensor(&bias); if (output) vxReleaseTensor(&output); -#else - vsi_nn_kernel_info_t kernel_info; - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - status = VSI_FAILURE; - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "vsi_nn_kernel_fullconnect2"; - kernel_info.type = VX_KERNEL_TYPE_VX; - kernel_info.kernel = vx_kernel_FCL2_list; - kernel_info.kernel_index = 1; - kernel_info.init_index = 1; - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) free(kernel_info.resource_name); - if( NULL == self->n ) - { - return VSI_FAILURE; - } - if (NULL != op_compute_list[kernel_info.init_index]) - { - status = op_compute_list[kernel_info.init_index](self, inputs, outputs); - } -#endif return status; } /* op_compute() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c index 4cc922e..81af2ce 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c @@ -74,6 +74,7 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "block_size", block_size ); vsi_nn_kernel_param_add_int32( param, "block_num", block_num ); vsi_nn_kernel_param_add_int32( param, "axis_num", axis_num ); + vsi_nn_kernel_param_add_int32( param, "axis", axis ); vsi_nn_kernel_param_add_int32( param, "indices_num", indices_num ); n = vsi_nn_kernel_selector( self->graph, "gather", inputs, 2, outputs, 1, param ); if( n != NULL ) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c index 46a23ce..2a0f6a2 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c @@ -41,6 +41,50 @@ #define _INPUT_NUM (3) #define _OUTPUT_NUM (1) +static vsi_status _try_set_high_presision_tensor + ( + vsi_nn_tensor_t **inputs + ) +{ + vsi_status status; + vsi_nn_vxtensor_attr_t attr; + + status = VSI_SUCCESS; + attr = VSI_NN_TENSOR_ATTR_HIGH_PRECISION; + + if(VSI_NN_TYPE_FLOAT32 == inputs[1]->attr.dtype.vx_type) + { + status = vsi_nn_SetTensorAttr(inputs[1], attr); + if(VSI_SUCCESS != status) + { + return status; + } + } + if(VSI_NN_TYPE_FLOAT32 == inputs[2]->attr.dtype.vx_type) + { + status = vsi_nn_SetTensorAttr(inputs[2], attr); + if(VSI_SUCCESS != status) + { + return status; + } + } + + return status; +} + +static vsi_bool _is_3d_instance_norm + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs + ) +{ + if( 3 == inputs[0]->attr.dim_num ) + { + return TRUE; + } + return FALSE; +} /* _is_3d_instance_norm() */ + static vsi_status op_compute ( vsi_nn_node_t * self, @@ -55,19 +99,42 @@ static vsi_status op_compute uint32_t *input_size = inputs[0]->attr.size; uint32_t dims_num = inputs[0]->attr.dim_num; int32_t rs_flg = 0; + vsi_nn_tensor_t * tmp_inputs[3] = {NULL, NULL, NULL}; + vsi_nn_tensor_t * tmp_outputs[1] = {NULL}; + vsi_nn_instancenorm_lcl_data2 *local = self->nn_param.instancenorm.lcl2_data; - param =vsi_nn_kernel_param_create(); - - if((input_size[1] * input_size[2] < 65536) - && dims_num > 2) + status = _try_set_high_presision_tensor(inputs); + if(status != VSI_SUCCESS) { - rs_flg = 1; + VSILOGE("Set tensor attr of high presision fail"); + return 
status; } + if(_is_3d_instance_norm(self, inputs)) + { + tmp_inputs[0] = local->reshaped_input; + tmp_outputs[0] = local->reshaped_output; + tmp_inputs[1] = inputs[1]; + tmp_inputs[2] = inputs[2]; + } + else + { + tmp_inputs[0] = inputs[0]; + tmp_outputs[0] = outputs[0]; + tmp_inputs[1] = inputs[1]; + tmp_inputs[2] = inputs[2]; + if((input_size[1] * input_size[2] < 65536) + && dims_num > 2) + { + rs_flg = 1; + } + } + + param =vsi_nn_kernel_param_create(); vsi_nn_kernel_param_add_float32( param, "eps", eps ); vsi_nn_kernel_param_add_int32( param, "reshape_flg", rs_flg ); n = vsi_nn_kernel_selector( self->graph, "instance_norm", - inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); + tmp_inputs, _INPUT_NUM, tmp_outputs, _OUTPUT_NUM, param ); if( n != NULL ) { self->n = (vx_node)n; @@ -82,6 +149,59 @@ static vsi_status op_compute return status; } /* op_compute() */ +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + uint32_t dim = 0; + vsi_nn_instancenorm_lcl_data2 *local = NULL; + uint32_t shape[VSI_NN_MAX_DIM_NUM]; + char tensor_name[128]; + + dim = inputs[0]->attr.dim_num; + if(_is_3d_instance_norm(self, inputs) == FALSE) + { + return VSI_SUCCESS; + } + + VSILOGD("Optimize 3D %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); + /* + insert a reshape node before and after 3D instance_norm + */ + shape[0] = 1; + shape[1] = inputs[0]->attr.size[0]; + shape[2] = inputs[0]->attr.size[1]; + shape[3] = inputs[0]->attr.size[2]; + dim = 4; + local = self->nn_param.instancenorm.lcl2_data; + if (VSI_NN_OPTIMIZE_FORWARD == direction) + { + /* reshape 3d input (xcn) --> 4d input (whcn) */ + local->reshaped_input = vsi_nn_reshape_tensor(self->graph, inputs[0], shape, dim); + } + else + { + /* reshape 3d output(xcn) --> 4d output(whcn) */ + local->reshaped_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape, dim); + if(local->reshaped_output && local->reshaped_output->t) + { + memset(tensor_name, 0, sizeof(tensor_name)); + snprintf(tensor_name, sizeof(tensor_name), "uid_%u_reshape_out_0", self->uid); + if(vxSetReferenceName((vx_reference)local->reshaped_output->t, tensor_name) == VSI_FAILURE) + { + VSILOGW("Set uid %u batchnorm reshaped output name fail", self->uid); + return VSI_FAILURE; + } + } + } + + return VSI_SUCCESS; +} /* op_optimize() */ + static vsi_bool op_check ( vsi_nn_node_t * self, @@ -133,6 +253,8 @@ static vsi_status op_init self->nn_param.instancenorm.lcl2_data->reshapeFlg = 0; self->nn_param.instancenorm.lcl2_data->execute_on_sw = 0; self->nn_param.instancenorm.lcl2_data->hash_idx = 0; + self->nn_param.instancenorm.lcl2_data->reshaped_input = NULL; + self->nn_param.instancenorm.lcl2_data->reshaped_output = NULL; return status; } /* op_init() */ @@ -143,6 +265,7 @@ static vsi_status op_deinit ) { uint32_t i; + vsi_nn_instancenormalize_param *p = &(self->nn_param.instancenorm); for (i = 0; i < _VSI_NN_INSTANCENORM_LOCAL_TENSOR_NUM; i++) { if (self->nn_param.instancenorm.local.local_tensor[i] != NULL) @@ -151,6 +274,16 @@ static vsi_status op_deinit self->nn_param.instancenorm.local.local_tensor[i] = NULL; } } + if(p->lcl2_data->reshaped_input) + { + vsi_nn_ReleaseTensor(&(p->lcl2_data->reshaped_input)); + p->lcl2_data->reshaped_input = NULL; + } + if(p->lcl2_data->reshaped_output) + { + vsi_nn_ReleaseTensor(&(p->lcl2_data->reshaped_output)); + p->lcl2_data->reshaped_output = NULL; + } if(self->nn_param.instancenorm.lcl2_data) { 
free(self->nn_param.instancenorm.lcl2_data); @@ -173,7 +306,7 @@ DEF_OP_REG /* deinit */ op_deinit, /* check */ op_check, /* setup */ vsi_nn_op_common_setup, - /* optimize */ NULL, + /* optimize */ op_optimize, /* input_num */ _INPUT_NUM, /* output_num */ _OUTPUT_NUM ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c index 52a54bb..04e5610 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c @@ -115,6 +115,45 @@ final: } +static vsi_bool _check_value_is_equal_to_one + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t* tensor + ) +{ + vsi_bool ret = TRUE; + float* tensor_data = NULL; + uint32_t elements = 0; + uint32_t i = 0; + + elements = vsi_nn_GetElementNum( tensor ); + tensor_data = vsi_nn_ConvertTensorToFloat32Data( graph, tensor ); + if ( NULL == tensor_data ) + { + VSILOGE( "Convert data fail." ); + return FALSE; + } + + for (i = 0; i < elements; i++) + { + if ( vsi_abs(tensor_data[i] - 1.0f) > 1e-5 ) + { + ret = FALSE; + break; + } + } + + if ( !tensor->attr.is_created_from_handle ) + { + if ( tensor_data ) + { + free(tensor_data); + } + } + + return ret; +} + static vsi_status op_compute ( vsi_nn_node_t * self, @@ -141,6 +180,11 @@ static vsi_status op_compute p = &(self->nn_param.l2normalizescale); axis = p->axis; + if ( inputs[1]->attr.is_const == TRUE && _check_value_is_equal_to_one(self->graph, inputs[1]) ) + { + return vsi_nn_internal_compute_node( self ); + } + param =vsi_nn_kernel_param_create(); ret = vsi_nn_kernel_optimize_reduce_shape( @@ -240,6 +284,9 @@ static vsi_status op_deinit self->nn_param.l2normalizescale.local.local_tensor[i] = NULL; } } + + vsi_nn_internal_deinit_node_wksp( self ); + vsi_nn_op_common_deinit(self); return VSI_SUCCESS; @@ -253,11 +300,15 @@ static vsi_bool op_setup ) { vsi_bool ret = TRUE; + vsi_nn_internal_node_t* curr = NULL; + if( NULL == self ) { return FALSE; } + vsi_nn_internal_init_node_wksp( self ); + if (self->nn_param.l2normalizescale.axis < 0) { self->nn_param.l2normalizescale.axis += (int32_t)inputs[0]->attr.dim_num; @@ -269,6 +320,15 @@ static vsi_bool op_setup return FALSE; } + if ( inputs[1]->attr.is_const == TRUE && _check_value_is_equal_to_one( self->graph, inputs[1] ) ) + { + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_L2_NORMALIZE, 0, 0); + curr->node->nn_param.l2_normalize.axis = self->nn_param.l2normalizescale.axis; + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node( self, curr ); + } + ret = vsi_nn_op_common_setup(self, inputs, outputs); return ret; @@ -280,7 +340,7 @@ static vsi_status op_init ) { vsi_status status = VSI_SUCCESS; - uint32_t i; + uint32_t i = 0; if (vsi_nn_compareVersion(self->graph, 1, 1, 13) == -1) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c index 87f2b54..7cc8663 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c @@ -35,312 +35,11 @@ #include "vsi_nn_prv.h" #include "vsi_nn_log.h" #include "client/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" -#define _ARG_NUM (1) #define _INPUT_NUM (3) #define _OUTPUT_NUM (1) -#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _PARAM_NUM (_ARG_NUM + _IO_NUM) - -extern vx_kernel_description_t * vx_kernel_LAYERNORM_list[]; - -static void check_tensor_shape - ( - 
vsi_nn_node_t * self, - vsi_nn_tensor_t * input, - vx_reference * params, - uint32_t index, - vx_bool rsFlg - ) -{ - vsi_nn_tensor_attr_t attr; - - if (index == 0 ) - { - if(input->attr.dim_num == 1) - { - memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t)); - attr.size[1] = 1; - attr.dim_num = 2; - self->nn_param.layernorm.local.local_tensor[index] = - vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num); - params[index] = (vx_reference)self->nn_param.layernorm.local.local_tensor[index]; - } - else if ((input->attr.dim_num == 3 && input->attr.size[2] == 1) - ||(input->attr.dim_num == 4 && input->attr.size[2] == 1 && input->attr.size[3] == 1)) - { - memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t)); - attr.dim_num = 2; - self->nn_param.layernorm.local.local_tensor[index] = - vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num); - params[index] = (vx_reference)self->nn_param.layernorm.local.local_tensor[index]; - } - else - params[index] = (vx_reference)input->t; - } - else if(index == 1 ) - { - if(input->attr.dim_num == 1) - { - memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t)); - attr.size[1] = 1; - attr.dim_num = 2; - self->nn_param.layernorm.local.local_tensor[index] = - vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num); - params[index] = (vx_reference)self->nn_param.layernorm.local.local_tensor[index]; - } - else - params[index] = (vx_reference)input->t; - - } - else if(index == 2) - { - if(input->attr.dim_num == 1) - { - memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t)); - attr.size[1] = 1; - attr.dim_num = 2; - self->nn_param.layernorm.local.local_tensor[index] = - vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num); - params[index] = (vx_reference)self->nn_param.layernorm.local.local_tensor[index]; - } - else - params[index] = (vx_reference)input->t; - } - else if(index == 3) - { - if(input->attr.dim_num == 1) - { - memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t)); - attr.size[1] = 1; - attr.dim_num = 2; - self->nn_param.layernorm.local.local_tensor[index] = - vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num); - params[index] = (vx_reference)self->nn_param.layernorm.local.local_tensor[index]; - } - else if ((input->attr.dim_num == 3 && input->attr.size[2] == 1) - ||(input->attr.dim_num == 4 && input->attr.size[2] == 1 && input->attr.size[3] == 1)) - { - memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t)); - attr.dim_num = 2; - self->nn_param.layernorm.local.local_tensor[index] = - vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num); - params[index] = (vx_reference)self->nn_param.layernorm.local.local_tensor[index]; - } - else - params[index] = (vx_reference)input->t; - } - else - { - VSILOGE("No more local tensor!(LAYERNORM) at [%s : %d]\n", __FILE__, __LINE__); - } -} - -static void _set_inputs_outputs - ( - vx_reference * params, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - uint32_t i; - uint32_t cnt; - - /* Set inputs */ - cnt = 0; - for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)inputs[i]->t; - } - - /* Set outputs */ - for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)outputs[i]->t; - } -} /* _set_inputs_outputs() */ - -static vsi_status _create_params - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num - ) -{ - vsi_status status; - vx_context ctx; - vsi_nn_layernormalize_param * p; - if( 0 == num ) - { - return VSI_SUCCESS; - } - memset( params, 0, sizeof( 
vx_reference * ) * num ); - p = &(node->nn_param.layernorm); - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ -#define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ - status = vxGetStatus( params[i] ); \ - if( VSI_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - _SET_PARAM( 0, VX_TYPE_FLOAT32, eps ); -#undef _SET_PARAM -set_param_error: - - return status; -} /* _create_params */ - -static void _release_params - ( - vx_reference * params, - uint32_t num - ) -{ - uint32_t i; - vx_scalar scalar; - for( i = 0; i < num; i ++ ) - { - scalar = (vx_scalar)params[i]; - vxReleaseScalar( &scalar ); - } -} /* _release_params() */ - -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_status vx_op_pre_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs, - vsi_nn_kernel_info_t * kernel_info - ) -{ - vsi_nn_type_e inputDataFormat = inputs[0]->attr.dtype.vx_type; - vsi_nn_type_e outputDataFormat = outputs[0]->attr.dtype.vx_type; - vsi_nn_type_e scaleDataFormat = inputs[2]->attr.dtype.vx_type; - if (inputDataFormat == VSI_NN_TYPE_FLOAT16 && outputDataFormat == VSI_NN_TYPE_FLOAT16 - && scaleDataFormat == VSI_NN_TYPE_FLOAT16) - { - kernel_info->kernel_index = 1; - } - else if (inputDataFormat == VSI_NN_TYPE_UINT8 && outputDataFormat == VSI_NN_TYPE_UINT8 - && scaleDataFormat == VSI_NN_TYPE_FLOAT16) - { - kernel_info->kernel_index = 2; - } - else if (inputDataFormat == VSI_NN_TYPE_FLOAT16 && outputDataFormat == VSI_NN_TYPE_UINT8 - && scaleDataFormat == VSI_NN_TYPE_FLOAT16) - { - kernel_info->kernel_index = 3; - } - else if (inputDataFormat == VSI_NN_TYPE_UINT8 && outputDataFormat == VSI_NN_TYPE_FLOAT16 - && scaleDataFormat == VSI_NN_TYPE_FLOAT16) - { - kernel_info->resource_name[0] = "vsi_nn_kernel_layernormalize_U8"; - kernel_info->kernel_index = 4; - } - else - { - VSILOGE("Not support input or output data format!(LAYERNORM) at [%s : %d]\n", __FILE__, __LINE__); - return VSI_FAILURE; - } - return VSI_SUCCESS; -} - -static vsi_status vx_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_border_t border; - vx_reference * args; - vx_bool rsFlg = FALSE; - int32_t in_zp; - vsi_nn_type_e inputDataFormat = inputs[0]->attr.dtype.vx_type; - vsi_nn_tensor_attr_t attr; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - //_set_inputs_outputs( params, inputs, outputs ); - check_tensor_shape(self, inputs[0], params, 0, rsFlg); - check_tensor_shape(self, inputs[1], params, 1, rsFlg); - check_tensor_shape(self, inputs[2], params, 2, rsFlg); - check_tensor_shape(self, outputs[0], params, 3, rsFlg); - memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); - status = vsi_nn_vxGetTensorAttr(inputs[0]->t, &attr); - in_zp = 
attr.dtype.zero_point; - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - border.mode = VX_BORDER_CONSTANT; - border.constant_value.U32 = 0; - border.constant_value.S16 = 0; - border.constant_value.U8 = 0; - if(inputDataFormat == VSI_NN_TYPE_UINT8) - { - border.constant_value.U32 = (vx_uint32)in_zp; - border.constant_value.S16 = (vx_int16)in_zp; - border.constant_value.U8 = (vx_uint8)in_zp; - } - status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border)); - - return status; -} - -static vsi_nn_op_compute_t op_compute_list[] = -{ - cpu_op_compute, - vx_op_compute, - NULL -}; static vsi_status op_compute ( @@ -349,35 +48,44 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - vsi_nn_kernel_info_t kernel_info; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_kernel_node_t n = NULL; + float eps = self->nn_param.instancenorm.eps; + uint32_t *input_size = inputs[0]->attr.size; + uint32_t dims_num = inputs[0]->attr.dim_num; + int32_t rs_flg = 0; + int32_t wh_flg = 0; - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - status = VSI_FAILURE; - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "vsi_nn_kernel_layernormalize"; - kernel_info.type = vsi_nn_GetVXKernelTypeForShader(); - kernel_info.kernel = vx_kernel_LAYERNORM_list; - kernel_info.init_index = 1; + param =vsi_nn_kernel_param_create(); - if (vsi_nn_is_do_vx_op_pre_init(kernel_info.type)) + if (input_size[0] >= GPU_TENSOR_MAX_WIDTH) { - vx_op_pre_compute(self, inputs, outputs, &kernel_info); + wh_flg = 1; } - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) free(kernel_info.resource_name); - if( NULL == self->n ) + if ((input_size[1] * input_size[2] < GPU_TENSOR_MAX_WIDTH) + && dims_num > 2) { - return VSI_FAILURE; + rs_flg = 1; } - if (NULL != op_compute_list[kernel_info.init_index]) + vsi_nn_kernel_param_add_float32( param, "eps", eps ); + vsi_nn_kernel_param_add_int32( param, "reshape_flg", rs_flg ); + vsi_nn_kernel_param_add_int32( param, "wh_flg", wh_flg ); + n = vsi_nn_kernel_selector( self->graph, "layer_norm", + inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); + if ( n != NULL ) { - status = op_compute_list[kernel_info.init_index](self, inputs, outputs); + self->n = (vx_node)n; + status = VSI_SUCCESS; } + + if (param != NULL) + { + vsi_nn_kernel_param_release( ¶m ); + } + return status; } /* op_compute() */ @@ -389,10 +97,12 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(LAYER_NORM, 3, 1) + IO_TYPE(D_F32, D_F32, D_F32, D_F32) IO_TYPE(D_F16, D_F32, D_F16, D_F16) IO_TYPE(D_F16, D_F32, D_F16, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_I16|Q_DFP) END_IO_TYPE_DECL(LAYER_NORM) if (!VALIDATE_OP_IO_TYPES(LAYER_NORM, self, inputs, self->input.num, outputs, self->output.num)) { @@ -438,8 +148,8 @@ DEF_OP_REG /* check */ op_check, /* setup */ vsi_nn_op_common_setup, /* optimize */ NULL, - /* input_num */ 3, - /* output_num */ 1 + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM ); #ifdef __cplusplus } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c 
b/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c index e61783d..f4b8efe 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c @@ -65,13 +65,13 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "adjointB", adjointB ); n = vsi_nn_kernel_selector( self->graph, "matrixmul", inputs, 2, outputs, 1, param ); - if( n != NULL ) + if ( n != NULL ) { self->n = (vx_node)n; status = VSI_SUCCESS; } - if(param != NULL) + if (param != NULL) { vsi_nn_kernel_param_release( ¶m ); } @@ -103,15 +103,19 @@ static vsi_bool op_check IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_F16, D_F16, D_F16) IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_F32, D_I8|Q_DFP, D_F32) + IO_TYPE(D_F32, D_I16|Q_DFP, D_F32) + IO_TYPE(D_F32, D_I32, D_F32) IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) END_IO_TYPE_DECL(MATRIXMUL) - if(!VALIDATE_OP_IO_TYPES(MATRIXMUL, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(MATRIXMUL, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); @@ -141,7 +145,7 @@ static vsi_bool op_check return FALSE; } - if(inputs[0]->attr.dim_num > 2 && inputs[1]->attr.dim_num > 2 + if (inputs[0]->attr.dim_num > 2 && inputs[1]->attr.dim_num > 2 && inputs[0]->attr.size[2] != 1 && inputs[1]->attr.size[2] != 1 && inputs[0]->attr.size[2] != inputs[1]->attr.size[2]) { @@ -160,7 +164,7 @@ static vsi_bool op_setup ) { uint32_t i = 0; - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = vsi_nn_max(inputs[0]->attr.dim_num, inputs[1]->attr.dim_num); @@ -188,21 +192,21 @@ static vsi_bool op_setup return FALSE; } - if(inputs[0]->attr.dim_num > inputs[1]->attr.dim_num) + if (inputs[0]->attr.dim_num > inputs[1]->attr.dim_num) { for (i = 2; i < inputs[0]->attr.dim_num; i++) { outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; } } - else if(inputs[1]->attr.dim_num > inputs[0]->attr.dim_num) + else if (inputs[1]->attr.dim_num > inputs[0]->attr.dim_num) { for (i = 2; i < inputs[1]->attr.dim_num; i++) { outputs[0]->attr.size[i] = inputs[1]->attr.size[i]; } } - else if(inputs[0]->attr.size[2] >= inputs[1]->attr.size[2]) + else if (inputs[0]->attr.size[2] >= inputs[1]->attr.size[2]) { for (i = 2; i < inputs[0]->attr.dim_num; i++) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c index c1877de..ccb0510 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c @@ -81,295 +81,413 @@ static vsi_bool op_setup { /* TODO: Add code to comput outputs' shape. 
*/ vsi_nn_internal_node_t* curr = NULL; - vsi_nn_pre_process_param * p; + vsi_nn_pre_process_param * p = NULL; vsi_bool ret = TRUE; + vsi_nn_internal_tensor_t* preprocess_tensor = NULL; + vsi_nn_preprocess_dest_layout_e layout = VSI_NN_DEST_LAYOUT_NCHW; p = (vsi_nn_pre_process_param *)&(self->nn_param.pre_process); vsi_nn_internal_init_node_wksp( self ); - if (p->type == VSI_NN_SOURCE_FORMAT_TENSOR) - { - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_TENSOR, 0, 0 ); - - curr->node->nn_param.pre_process_tensor.perm = p->perm; - curr->node->nn_param.pre_process_tensor.dim_num = p->dim_num; - - curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; - curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; - - vsi_nn_internal_setup_node(self, curr); - } - else if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_GRAY) - { - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_GRAY, 0, 0 ); - - curr->node->nn_param.pre_process_gray.mean = p->norm.mean[0]; - curr->node->nn_param.pre_process_gray.scale = p->norm.scale; - curr->node->nn_param.pre_process_gray.rect.left = p->rect.left; - curr->node->nn_param.pre_process_gray.rect.top = p->rect.top; - curr->node->nn_param.pre_process_gray.rect.width = p->rect.width; - curr->node->nn_param.pre_process_gray.rect.height = p->rect.height; - curr->node->nn_param.pre_process_gray.output_attr.size = p->output_attr.size; - curr->node->nn_param.pre_process_gray.output_attr.dim_num = p->output_attr.dim_num; - - curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; - curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; - - vsi_nn_internal_setup_node(self, curr); - } - else if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB) - { - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_RGB, 0, 0 ); - - if (p->reverse_channel) - { - curr->node->nn_param.pre_process_rgb.r_mean = p->norm.mean[2]; - curr->node->nn_param.pre_process_rgb.g_mean = p->norm.mean[1]; - curr->node->nn_param.pre_process_rgb.b_mean = p->norm.mean[0]; - } - else - { - curr->node->nn_param.pre_process_rgb.r_mean = p->norm.mean[0]; - curr->node->nn_param.pre_process_rgb.g_mean = p->norm.mean[1]; - curr->node->nn_param.pre_process_rgb.b_mean = p->norm.mean[2]; - } - - curr->node->nn_param.pre_process_rgb.rgb_scale = p->norm.scale; - curr->node->nn_param.pre_process_rgb.reverse_channel = p->reverse_channel; - curr->node->nn_param.pre_process_rgb.rect.left = p->rect.left; - curr->node->nn_param.pre_process_rgb.rect.top = p->rect.top; - curr->node->nn_param.pre_process_rgb.rect.width = p->rect.width; - curr->node->nn_param.pre_process_rgb.rect.height = p->rect.height; - curr->node->nn_param.pre_process_rgb.output_attr.size = p->output_attr.size; - curr->node->nn_param.pre_process_rgb.output_attr.dim_num = p->output_attr.dim_num; - curr->node->nn_param.pre_process_rgb.perm = p->perm; - curr->node->nn_param.pre_process_rgb.dim_num = p->dim_num; - - curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; - curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; - - vsi_nn_internal_setup_node(self, curr); - } - else if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUV420) - { - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_YUV420, 0, 0 ); - - if (p->reverse_channel) - { - curr->node->nn_param.pre_process_yuv420.r_mean = p->norm.mean[2]; - curr->node->nn_param.pre_process_yuv420.g_mean = p->norm.mean[1]; - curr->node->nn_param.pre_process_yuv420.b_mean = p->norm.mean[0]; - } - else - { - curr->node->nn_param.pre_process_yuv420.r_mean = p->norm.mean[0]; - curr->node->nn_param.pre_process_yuv420.g_mean = p->norm.mean[1]; - 
curr->node->nn_param.pre_process_yuv420.b_mean = p->norm.mean[2]; - } - - curr->node->nn_param.pre_process_yuv420.rgb_scale = p->norm.scale; - curr->node->nn_param.pre_process_yuv420.reverse_channel = p->reverse_channel; - curr->node->nn_param.pre_process_yuv420.rect.left = p->rect.left; - curr->node->nn_param.pre_process_yuv420.rect.top = p->rect.top; - curr->node->nn_param.pre_process_yuv420.rect.width = p->rect.width; - curr->node->nn_param.pre_process_yuv420.rect.height = p->rect.height; - curr->node->nn_param.pre_process_yuv420.output_attr.size = p->output_attr.size; - curr->node->nn_param.pre_process_yuv420.output_attr.dim_num = p->output_attr.dim_num; - curr->node->nn_param.pre_process_yuv420.perm = p->perm; - curr->node->nn_param.pre_process_yuv420.dim_num = p->dim_num; - - curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; - curr->inputs[1] = inputs[PRE_PROCESS_INPUT1]; - curr->inputs[2] = inputs[PRE_PROCESS_INPUT2]; - curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; - - vsi_nn_internal_setup_node(self, curr); - } - else if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_BGRA) - { - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_BGRA, 0, 0 ); - - if (p->reverse_channel) - { - curr->node->nn_param.pre_process_bgra.r_mean = p->norm.mean[2]; - curr->node->nn_param.pre_process_bgra.g_mean = p->norm.mean[1]; - curr->node->nn_param.pre_process_bgra.b_mean = p->norm.mean[0]; - } - else - { - curr->node->nn_param.pre_process_bgra.r_mean = p->norm.mean[0]; - curr->node->nn_param.pre_process_bgra.g_mean = p->norm.mean[1]; - curr->node->nn_param.pre_process_bgra.b_mean = p->norm.mean[2]; - } - - curr->node->nn_param.pre_process_bgra.rgb_scale = p->norm.scale; - curr->node->nn_param.pre_process_bgra.reverse_channel = p->reverse_channel; - curr->node->nn_param.pre_process_bgra.rect.left = p->rect.left; - curr->node->nn_param.pre_process_bgra.rect.top = p->rect.top; - curr->node->nn_param.pre_process_bgra.rect.width = p->rect.width; - curr->node->nn_param.pre_process_bgra.rect.height = p->rect.height; - curr->node->nn_param.pre_process_bgra.output_attr.size = p->output_attr.size; - curr->node->nn_param.pre_process_bgra.output_attr.dim_num = p->output_attr.dim_num; - curr->node->nn_param.pre_process_bgra.perm = p->perm; - curr->node->nn_param.pre_process_bgra.dim_num = p->dim_num; - - curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; - curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; - - vsi_nn_internal_setup_node(self, curr); - } - else if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR) + if ( p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUV420 || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUV444 || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12 || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_BGRA || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR + ) { uint32_t i = 0; - uint32_t axis = 2; - uint32_t group = 3; - vsi_nn_tensor_t ** input_tensor_group = &p->local->local_tensor[0]; - vsi_nn_internal_tensor_t * output_tensor_group[3] = {NULL}; - vsi_nn_internal_tensor_t* tmp_outputs[3] = { NULL }; + uint32_t _axis = 0; vsi_nn_tensor_attr_t attr; - float mean[3] = {0}; + vsi_bool use_virtual_tensor = TRUE; - memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); - ret = vsi_nn_CreateTensorGroup(self->graph, inputs[0], axis, - input_tensor_group, group); - if (ret == FALSE) + for (i = 0; i < p->dim_num; i++) { - goto final; + _axis = p->perm[i]; + if (_axis != i) + break; } - memcpy(&attr, &outputs[0]->attr, sizeof(vsi_nn_tensor_attr_t)); - memcpy(&attr.size, 
p->output_attr.size, p->output_attr.dim_num * sizeof(uint32_t)); - attr.size[axis] = 1; - attr.vtl = TRUE; - attr.is_const = FALSE; - output_tensor_group[0] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); - output_tensor_group[1] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); - output_tensor_group[2] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); - - if (p->reverse_channel) + if (i != self->nn_param.pre_process_rgb.dim_num) { - int32_t order[3] = {2, 1, 0}; - - mean[0] = p->norm.mean[2]; - mean[1] = p->norm.mean[1]; - mean[2] = p->norm.mean[0]; - - vsi_nn_reorder_tensor( (vsi_nn_tensor_t **)output_tensor_group, order, - 3, (vsi_nn_tensor_t **)tmp_outputs ); - } - else - { - mean[0] = p->norm.mean[0]; - mean[1] = p->norm.mean[1]; - mean[2] = p->norm.mean[2]; - - memmove( tmp_outputs, output_tensor_group, sizeof(vsi_nn_tensor_t*) * 3 ); + layout = VSI_NN_DEST_LAYOUT_NHWC; } - for (i = 0; i < 3; i++) + if (layout == VSI_NN_DEST_LAYOUT_NHWC) + { + memcpy( &attr, &outputs[PRE_PROCESS_OUTPUT]->attr, sizeof( attr ) ); + attr.size[0] = p->output_attr.size[1]; + attr.size[1] = p->output_attr.size[2]; + attr.size[2] = p->output_attr.size[0]; + p->output_attr.size[0] = attr.size[0]; + p->output_attr.size[1] = attr.size[1]; + p->output_attr.size[2] = attr.size[2]; + attr.vtl = use_virtual_tensor; + attr.is_const = FALSE; + + preprocess_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + } + } + + switch (p->type) + { + case VSI_NN_SOURCE_FORMAT_TENSOR: + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_TENSOR, 0, 0 ); + + curr->node->nn_param.pre_process_tensor.perm = p->perm; + curr->node->nn_param.pre_process_tensor.dim_num = p->dim_num; + + curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; + curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; + + vsi_nn_internal_setup_node(self, curr); + } + break; + case VSI_NN_SOURCE_FORMAT_IMAGE_GRAY: { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_GRAY, 0, 0 ); - curr->node->nn_param.pre_process_gray.mean = mean[i]; + curr->node->nn_param.pre_process_gray.mean = p->norm.mean[0]; curr->node->nn_param.pre_process_gray.scale = p->norm.scale; curr->node->nn_param.pre_process_gray.rect.left = p->rect.left; curr->node->nn_param.pre_process_gray.rect.top = p->rect.top; curr->node->nn_param.pre_process_gray.rect.width = p->rect.width; curr->node->nn_param.pre_process_gray.rect.height = p->rect.height; - curr->node->nn_param.pre_process_gray.output_attr.size = attr.size; + curr->node->nn_param.pre_process_gray.output_attr.size = p->output_attr.size; curr->node->nn_param.pre_process_gray.output_attr.dim_num = p->output_attr.dim_num; - curr->inputs[0] = input_tensor_group[i]; - curr->outputs[0] = output_tensor_group[i]->t; + curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; + curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; vsi_nn_internal_setup_node(self, curr); } + break; + case VSI_NN_SOURCE_FORMAT_IMAGE_RGB: + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_RGB, 0, 0 ); - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, 3, 1 ); + if (p->reverse_channel) + { + curr->node->nn_param.pre_process_rgb.r_mean = p->norm.mean[2]; + curr->node->nn_param.pre_process_rgb.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_rgb.b_mean = p->norm.mean[0]; + } + else + { + curr->node->nn_param.pre_process_rgb.r_mean = p->norm.mean[0]; + curr->node->nn_param.pre_process_rgb.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_rgb.b_mean = p->norm.mean[2]; + } - curr->node->nn_param.concat.axis = axis; - 
curr->inputs[0] = tmp_outputs[0]->t; - curr->inputs[1] = tmp_outputs[1]->t; - curr->inputs[2] = tmp_outputs[2]->t; - curr->outputs[0] = outputs[0]; + curr->node->nn_param.pre_process_rgb.rgb_scale = p->norm.scale; + curr->node->nn_param.pre_process_rgb.reverse_channel = p->reverse_channel; + curr->node->nn_param.pre_process_rgb.rect.left = p->rect.left; + curr->node->nn_param.pre_process_rgb.rect.top = p->rect.top; + curr->node->nn_param.pre_process_rgb.rect.width = p->rect.width; + curr->node->nn_param.pre_process_rgb.rect.height = p->rect.height; + curr->node->nn_param.pre_process_rgb.output_attr.size = p->output_attr.size; + curr->node->nn_param.pre_process_rgb.output_attr.dim_num = p->output_attr.dim_num; + curr->node->nn_param.pre_process_rgb.perm = p->perm; + curr->node->nn_param.pre_process_rgb.dim_num = p->dim_num; - vsi_nn_internal_setup_node(self, curr); + curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; + if (layout == VSI_NN_DEST_LAYOUT_NHWC) + { + curr->outputs[0] = preprocess_tensor->t; + } + else + { + curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; + } + + vsi_nn_internal_setup_node(self, curr); + } + break; + case VSI_NN_SOURCE_FORMAT_IMAGE_YUV420: + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_YUV420, 0, 0 ); + + if (p->reverse_channel) + { + curr->node->nn_param.pre_process_yuv420.r_mean = p->norm.mean[2]; + curr->node->nn_param.pre_process_yuv420.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_yuv420.b_mean = p->norm.mean[0]; + } + else + { + curr->node->nn_param.pre_process_yuv420.r_mean = p->norm.mean[0]; + curr->node->nn_param.pre_process_yuv420.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_yuv420.b_mean = p->norm.mean[2]; + } + + curr->node->nn_param.pre_process_yuv420.rgb_scale = p->norm.scale; + curr->node->nn_param.pre_process_yuv420.reverse_channel = p->reverse_channel; + curr->node->nn_param.pre_process_yuv420.rect.left = p->rect.left; + curr->node->nn_param.pre_process_yuv420.rect.top = p->rect.top; + curr->node->nn_param.pre_process_yuv420.rect.width = p->rect.width; + curr->node->nn_param.pre_process_yuv420.rect.height = p->rect.height; + curr->node->nn_param.pre_process_yuv420.output_attr.size = p->output_attr.size; + curr->node->nn_param.pre_process_yuv420.output_attr.dim_num = p->output_attr.dim_num; + curr->node->nn_param.pre_process_yuv420.perm = p->perm; + curr->node->nn_param.pre_process_yuv420.dim_num = p->dim_num; + + curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; + curr->inputs[1] = inputs[PRE_PROCESS_INPUT1]; + curr->inputs[2] = inputs[PRE_PROCESS_INPUT2]; + if (layout == VSI_NN_DEST_LAYOUT_NHWC) + { + curr->outputs[0] = preprocess_tensor->t; + } + else + { + curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; + } + + vsi_nn_internal_setup_node(self, curr); + } + break; + case VSI_NN_SOURCE_FORMAT_IMAGE_BGRA: + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_BGRA, 0, 0 ); + + if (p->reverse_channel) + { + curr->node->nn_param.pre_process_bgra.r_mean = p->norm.mean[2]; + curr->node->nn_param.pre_process_bgra.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_bgra.b_mean = p->norm.mean[0]; + } + else + { + curr->node->nn_param.pre_process_bgra.r_mean = p->norm.mean[0]; + curr->node->nn_param.pre_process_bgra.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_bgra.b_mean = p->norm.mean[2]; + } + + curr->node->nn_param.pre_process_bgra.rgb_scale = p->norm.scale; + curr->node->nn_param.pre_process_bgra.reverse_channel = p->reverse_channel; + 
curr->node->nn_param.pre_process_bgra.rect.left = p->rect.left; + curr->node->nn_param.pre_process_bgra.rect.top = p->rect.top; + curr->node->nn_param.pre_process_bgra.rect.width = p->rect.width; + curr->node->nn_param.pre_process_bgra.rect.height = p->rect.height; + curr->node->nn_param.pre_process_bgra.output_attr.size = p->output_attr.size; + curr->node->nn_param.pre_process_bgra.output_attr.dim_num = p->output_attr.dim_num; + curr->node->nn_param.pre_process_bgra.perm = p->perm; + curr->node->nn_param.pre_process_bgra.dim_num = p->dim_num; + + curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; + if (layout == VSI_NN_DEST_LAYOUT_NHWC) + { + curr->outputs[0] = preprocess_tensor->t; + } + else + { + curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; + } + + vsi_nn_internal_setup_node(self, curr); + } + break; + case VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR: + { + uint32_t i = 0; + uint32_t axis = 2; + uint32_t group = 3; + vsi_nn_tensor_t ** input_tensor_group = &p->local->local_tensor[0]; + vsi_nn_internal_tensor_t * output_tensor_group[3] = {NULL}; + vsi_nn_internal_tensor_t* tmp_outputs[3] = { NULL }; + vsi_nn_tensor_attr_t attr; + float mean[3] = {0}; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + + ret = vsi_nn_CreateTensorGroup(self->graph, inputs[0], axis, + input_tensor_group, group); + if (ret == FALSE) + { + goto final; + } + + memcpy(&attr, &outputs[0]->attr, sizeof(vsi_nn_tensor_attr_t)); + memcpy(&attr.size, p->output_attr.size, p->output_attr.dim_num * sizeof(uint32_t)); + attr.size[axis] = 1; + attr.vtl = TRUE; + attr.is_const = FALSE; + output_tensor_group[0] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + output_tensor_group[1] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + output_tensor_group[2] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + if (p->reverse_channel) + { + int32_t order[3] = {2, 1, 0}; + + mean[0] = p->norm.mean[2]; + mean[1] = p->norm.mean[1]; + mean[2] = p->norm.mean[0]; + + vsi_nn_reorder_tensor( (vsi_nn_tensor_t **)output_tensor_group, order, + 3, (vsi_nn_tensor_t **)tmp_outputs ); + } + else + { + mean[0] = p->norm.mean[0]; + mean[1] = p->norm.mean[1]; + mean[2] = p->norm.mean[2]; + + memmove( tmp_outputs, output_tensor_group, sizeof(vsi_nn_tensor_t*) * 3 ); + } + + for (i = 0; i < 3; i++) + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_GRAY, 0, 0 ); + + curr->node->nn_param.pre_process_gray.mean = mean[i]; + curr->node->nn_param.pre_process_gray.scale = p->norm.scale; + curr->node->nn_param.pre_process_gray.rect.left = p->rect.left; + curr->node->nn_param.pre_process_gray.rect.top = p->rect.top; + curr->node->nn_param.pre_process_gray.rect.width = p->rect.width; + curr->node->nn_param.pre_process_gray.rect.height = p->rect.height; + curr->node->nn_param.pre_process_gray.output_attr.size = attr.size; + curr->node->nn_param.pre_process_gray.output_attr.dim_num = p->output_attr.dim_num; + + curr->inputs[0] = input_tensor_group[i]; + curr->outputs[0] = output_tensor_group[i]->t; + + vsi_nn_internal_setup_node(self, curr); + } + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, 3, 1 ); + + curr->node->nn_param.concat.axis = axis; + curr->inputs[0] = tmp_outputs[0]->t; + curr->inputs[1] = tmp_outputs[1]->t; + curr->inputs[2] = tmp_outputs[2]->t; + if (layout == VSI_NN_DEST_LAYOUT_NHWC) + { + curr->outputs[0] = preprocess_tensor->t; + } + else + { + curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; + } + + vsi_nn_internal_setup_node(self, curr); + } + break; + case VSI_NN_SOURCE_FORMAT_IMAGE_YUV444: + { + curr = 
vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_YUV444, 0, 0 ); + + if (p->reverse_channel) + { + curr->node->nn_param.pre_process_yuv444.r_mean = p->norm.mean[2]; + curr->node->nn_param.pre_process_yuv444.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_yuv444.b_mean = p->norm.mean[0]; + } + else + { + curr->node->nn_param.pre_process_yuv444.r_mean = p->norm.mean[0]; + curr->node->nn_param.pre_process_yuv444.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_yuv444.b_mean = p->norm.mean[2]; + } + + curr->node->nn_param.pre_process_yuv444.rgb_scale = p->norm.scale; + curr->node->nn_param.pre_process_yuv444.reverse_channel = p->reverse_channel; + curr->node->nn_param.pre_process_yuv444.rect.left = p->rect.left; + curr->node->nn_param.pre_process_yuv444.rect.top = p->rect.top; + curr->node->nn_param.pre_process_yuv444.rect.width = p->rect.width; + curr->node->nn_param.pre_process_yuv444.rect.height = p->rect.height; + curr->node->nn_param.pre_process_yuv444.output_attr.size = p->output_attr.size; + curr->node->nn_param.pre_process_yuv444.output_attr.dim_num = p->output_attr.dim_num; + curr->node->nn_param.pre_process_yuv444.perm = p->perm; + curr->node->nn_param.pre_process_yuv444.dim_num = p->dim_num; + + curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; + curr->inputs[1] = inputs[PRE_PROCESS_INPUT1]; + curr->inputs[2] = inputs[PRE_PROCESS_INPUT2]; + if (layout == VSI_NN_DEST_LAYOUT_NHWC) + { + curr->outputs[0] = preprocess_tensor->t; + } + else + { + curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; + } + + vsi_nn_internal_setup_node(self, curr); + } + break; + case VSI_NN_SOURCE_FORMAT_IMAGE_NV12: + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_NV12, 0, 0 ); + + if (p->reverse_channel) + { + curr->node->nn_param.pre_process_nv12.r_mean = p->norm.mean[2]; + curr->node->nn_param.pre_process_nv12.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_nv12.b_mean = p->norm.mean[0]; + } + else + { + curr->node->nn_param.pre_process_nv12.r_mean = p->norm.mean[0]; + curr->node->nn_param.pre_process_nv12.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_nv12.b_mean = p->norm.mean[2]; + } + + curr->node->nn_param.pre_process_nv12.rgb_scale = p->norm.scale; + curr->node->nn_param.pre_process_nv12.reverse_channel = p->reverse_channel; + curr->node->nn_param.pre_process_nv12.rect.left = p->rect.left; + curr->node->nn_param.pre_process_nv12.rect.top = p->rect.top; + curr->node->nn_param.pre_process_nv12.rect.width = p->rect.width; + curr->node->nn_param.pre_process_nv12.rect.height = p->rect.height; + curr->node->nn_param.pre_process_nv12.output_attr.size = p->output_attr.size; + curr->node->nn_param.pre_process_nv12.output_attr.dim_num = p->output_attr.dim_num; + curr->node->nn_param.pre_process_nv12.perm = p->perm; + curr->node->nn_param.pre_process_nv12.dim_num = p->dim_num; + + curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; + curr->inputs[1] = inputs[PRE_PROCESS_INPUT1]; + if (layout == VSI_NN_DEST_LAYOUT_NHWC) + { + curr->outputs[0] = preprocess_tensor->t; + } + else + { + curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; + } + + vsi_nn_internal_setup_node(self, curr); + } + break; + default: + { + VSILOGE( "Not support this type!(PRE_PROCESS)\n"); + ret = FALSE; + } + break; } - else if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUV444) + + if ( p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUV420 || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUV444 || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12 || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB || + p->type 
== VSI_NN_SOURCE_FORMAT_IMAGE_BGRA || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR + ) { - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_YUV444, 0, 0 ); - - if (p->reverse_channel) + if (layout == VSI_NN_DEST_LAYOUT_NHWC) { - curr->node->nn_param.pre_process_yuv444.r_mean = p->norm.mean[2]; - curr->node->nn_param.pre_process_yuv444.g_mean = p->norm.mean[1]; - curr->node->nn_param.pre_process_yuv444.b_mean = p->norm.mean[0]; + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PERMUTE, 0, 0 ); + curr->node->nn_param.permute.perm = p->perm; + curr->node->nn_param.permute.dim_num = p->dim_num; + curr->inputs[0] = preprocess_tensor->t; + curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; + + vsi_nn_internal_setup_node( self, curr ); } - else - { - curr->node->nn_param.pre_process_yuv444.r_mean = p->norm.mean[0]; - curr->node->nn_param.pre_process_yuv444.g_mean = p->norm.mean[1]; - curr->node->nn_param.pre_process_yuv444.b_mean = p->norm.mean[2]; - } - - curr->node->nn_param.pre_process_yuv444.rgb_scale = p->norm.scale; - curr->node->nn_param.pre_process_yuv444.reverse_channel = p->reverse_channel; - curr->node->nn_param.pre_process_yuv444.rect.left = p->rect.left; - curr->node->nn_param.pre_process_yuv444.rect.top = p->rect.top; - curr->node->nn_param.pre_process_yuv444.rect.width = p->rect.width; - curr->node->nn_param.pre_process_yuv444.rect.height = p->rect.height; - curr->node->nn_param.pre_process_yuv444.output_attr.size = p->output_attr.size; - curr->node->nn_param.pre_process_yuv444.output_attr.dim_num = p->output_attr.dim_num; - curr->node->nn_param.pre_process_yuv444.perm = p->perm; - curr->node->nn_param.pre_process_yuv444.dim_num = p->dim_num; - - curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; - curr->inputs[1] = inputs[PRE_PROCESS_INPUT1]; - curr->inputs[2] = inputs[PRE_PROCESS_INPUT2]; - curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; - - vsi_nn_internal_setup_node(self, curr); - } - else if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12) - { - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_NV12, 0, 0 ); - - if (p->reverse_channel) - { - curr->node->nn_param.pre_process_nv12.r_mean = p->norm.mean[2]; - curr->node->nn_param.pre_process_nv12.g_mean = p->norm.mean[1]; - curr->node->nn_param.pre_process_nv12.b_mean = p->norm.mean[0]; - } - else - { - curr->node->nn_param.pre_process_nv12.r_mean = p->norm.mean[0]; - curr->node->nn_param.pre_process_nv12.g_mean = p->norm.mean[1]; - curr->node->nn_param.pre_process_nv12.b_mean = p->norm.mean[2]; - } - - curr->node->nn_param.pre_process_nv12.rgb_scale = p->norm.scale; - curr->node->nn_param.pre_process_nv12.reverse_channel = p->reverse_channel; - curr->node->nn_param.pre_process_nv12.rect.left = p->rect.left; - curr->node->nn_param.pre_process_nv12.rect.top = p->rect.top; - curr->node->nn_param.pre_process_nv12.rect.width = p->rect.width; - curr->node->nn_param.pre_process_nv12.rect.height = p->rect.height; - curr->node->nn_param.pre_process_nv12.output_attr.size = p->output_attr.size; - curr->node->nn_param.pre_process_nv12.output_attr.dim_num = p->output_attr.dim_num; - curr->node->nn_param.pre_process_nv12.perm = p->perm; - curr->node->nn_param.pre_process_nv12.dim_num = p->dim_num; - - curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; - curr->inputs[1] = inputs[PRE_PROCESS_INPUT1]; - curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; - - vsi_nn_internal_setup_node(self, curr); - } - else - { - VSILOGE( "Not support this type!(PRE_PROCESS)\n"); - return FALSE; } final: diff --git 
a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c index c0889c6..4ac9bb1 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c @@ -109,7 +109,6 @@ static vsi_bool op_setup { /* TODO: Add code to comput outputs' shape. */ vsi_nn_pre_process_bgra_param * p = NULL; - uint32_t axis = 0; uint32_t i = 0; p = (vsi_nn_pre_process_bgra_param *)&(self->nn_param.pre_process_bgra); @@ -155,28 +154,8 @@ static vsi_bool op_setup } } - for (i = 0; i < self->nn_param.pre_process_bgra.dim_num; i++) - { - axis = self->nn_param.pre_process_bgra.perm[i]; - if (axis != i) - break; - } - - if (i == self->nn_param.pre_process_bgra.dim_num) - self->nn_param.pre_process_bgra.local.enable_perm = FALSE; - else - self->nn_param.pre_process_bgra.local.enable_perm = TRUE; - - if (self->nn_param.pre_process_bgra.local.enable_perm == FALSE) - { - p->local.scale_x = (p->rect.width << 15) / outputs[0]->attr.size[0]; - p->local.scale_y = (p->rect.height << 15) / outputs[0]->attr.size[1]; - } - else - { - p->local.scale_x = (p->rect.width << 15) / outputs[0]->attr.size[1]; - p->local.scale_y = (p->rect.height << 15) / outputs[0]->attr.size[2]; - } + p->local.scale_x = (p->rect.width << 15) / outputs[0]->attr.size[0]; + p->local.scale_y = (p->rect.height << 15) / outputs[0]->attr.size[1]; p->local.enable_copy = ((p->local.scale_x == p->local.scale_y) && (p->local.scale_x == (1 << 15))); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c index d0f1454..d754e27 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c @@ -107,7 +107,6 @@ static vsi_bool op_setup { /* TODO: Add code to comput outputs' shape. */ vsi_nn_pre_process_nv12_param * p = NULL; - uint32_t axis = 0; uint32_t i = 0; p = (vsi_nn_pre_process_nv12_param *)&(self->nn_param.pre_process_nv12); @@ -153,28 +152,8 @@ static vsi_bool op_setup } } - for (i = 0; i < self->nn_param.pre_process_nv12.dim_num; i++) - { - axis = self->nn_param.pre_process_nv12.perm[i]; - if (axis != i) - break; - } - - if (i == self->nn_param.pre_process_nv12.dim_num) - self->nn_param.pre_process_nv12.local->enable_perm = FALSE; - else - self->nn_param.pre_process_nv12.local->enable_perm = TRUE; - - if (self->nn_param.pre_process_nv12.local->enable_perm == FALSE) - { - p->local->scale_x = (p->rect.width << 15) / outputs[0]->attr.size[0]; - p->local->scale_y = (p->rect.height << 15) / outputs[0]->attr.size[1]; - } - else - { - p->local->scale_x = (p->rect.width << 15) / outputs[0]->attr.size[1]; - p->local->scale_y = (p->rect.height << 15) / outputs[0]->attr.size[2]; - } + p->local->scale_x = (p->rect.width << 15) / outputs[0]->attr.size[0]; + p->local->scale_y = (p->rect.height << 15) / outputs[0]->attr.size[1]; p->local->enable_copy = ((p->local->scale_x == p->local->scale_y) && (p->local->scale_x == (1 << 15))); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c index bd9e5c8..a31005d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c @@ -112,7 +112,6 @@ static vsi_bool op_setup { /* TODO: Add code to comput outputs' shape. 
*/ vsi_nn_pre_process_rgb_param * p = NULL; - uint32_t axis = 0; uint32_t i = 0; p = (vsi_nn_pre_process_rgb_param *)&(self->nn_param.pre_process_rgb); @@ -158,17 +157,8 @@ static vsi_bool op_setup } } - for (i = 0; i < self->nn_param.pre_process_rgb.dim_num; i++) - { - axis = self->nn_param.pre_process_rgb.perm[i]; - if (axis != i) - break; - } - if (i == self->nn_param.pre_process_rgb.dim_num) - self->nn_param.pre_process_rgb.local.enable_perm = FALSE; - else - self->nn_param.pre_process_rgb.local.enable_perm = TRUE; + self->nn_param.pre_process_rgb.local.enable_perm = FALSE; if (self->nn_param.pre_process_rgb.local.enable_perm == FALSE) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c index 3fe0c49..50c2355 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c @@ -108,7 +108,6 @@ static vsi_bool op_setup { /* TODO: Add code to comput outputs' shape. */ vsi_nn_pre_process_yuv420_param * p = NULL; - uint32_t axis = 0; uint32_t i = 0; p = (vsi_nn_pre_process_yuv420_param *)&(self->nn_param.pre_process_yuv420); @@ -154,28 +153,8 @@ static vsi_bool op_setup } } - for (i = 0; i < self->nn_param.pre_process_yuv420.dim_num; i++) - { - axis = self->nn_param.pre_process_yuv420.perm[i]; - if (axis != i) - break; - } - - if (i == self->nn_param.pre_process_yuv420.dim_num) - self->nn_param.pre_process_yuv420.local.enable_perm = FALSE; - else - self->nn_param.pre_process_yuv420.local.enable_perm = TRUE; - - if (self->nn_param.pre_process_yuv420.local.enable_perm == FALSE) - { - p->local.scale_x = (p->rect.width << 15) / outputs[0]->attr.size[0]; - p->local.scale_y = (p->rect.height << 15) / outputs[0]->attr.size[1]; - } - else - { - p->local.scale_x = (p->rect.width << 15) / outputs[0]->attr.size[1]; - p->local.scale_y = (p->rect.height << 15) / outputs[0]->attr.size[2]; - } + p->local.scale_x = (p->rect.width << 15) / outputs[0]->attr.size[0]; + p->local.scale_y = (p->rect.height << 15) / outputs[0]->attr.size[1]; p->local.enable_copy = ((p->local.scale_x == p->local.scale_y) && (p->local.scale_x == (1 << 15))); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c index 0d7d370..99a7674 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c @@ -108,7 +108,6 @@ static vsi_bool op_setup { /* TODO: Add code to comput outputs' shape. 
*/ vsi_nn_pre_process_yuv444_param * p = NULL; - uint32_t axis = 0; uint32_t i = 0; p = (vsi_nn_pre_process_yuv444_param *)&(self->nn_param.pre_process_yuv444); @@ -154,28 +153,8 @@ static vsi_bool op_setup } } - for (i = 0; i < self->nn_param.pre_process_yuv444.dim_num; i++) - { - axis = self->nn_param.pre_process_yuv444.perm[i]; - if (axis != i) - break; - } - - if (i == self->nn_param.pre_process_yuv444.dim_num) - self->nn_param.pre_process_yuv444.local->enable_perm = FALSE; - else - self->nn_param.pre_process_yuv444.local->enable_perm = TRUE; - - if (self->nn_param.pre_process_yuv444.local->enable_perm == FALSE) - { - p->local->scale_x = (p->rect.width << 15) / outputs[0]->attr.size[0]; - p->local->scale_y = (p->rect.height << 15) / outputs[0]->attr.size[1]; - } - else - { - p->local->scale_x = (p->rect.width << 15) / outputs[0]->attr.size[1]; - p->local->scale_y = (p->rect.height << 15) / outputs[0]->attr.size[2]; - } + p->local->scale_x = (p->rect.width << 15) / outputs[0]->attr.size[0]; + p->local->scale_y = (p->rect.height << 15) / outputs[0]->attr.size[1]; p->local->enable_copy = ((p->local->scale_x == p->local->scale_y) && (p->local->scale_x == (1 << 15))); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c index 323ee4a..3d01e79 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c @@ -43,8 +43,6 @@ #define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) #define _PARAM_NUM (_ARG_NUM + _IO_NUM) -#define USE_OVX_API TRUE - typedef struct _vsi_nn_reduce_lcl2_data_t { vsi_nn_tensor_t *reshaped_input; @@ -57,125 +55,6 @@ typedef struct _vsi_nn_reduce_lcl2_data_t int32_t axes_num; } vsi_nn_reduce_lcl2_data_t; -#if (USE_OVX_API == FALSE) -extern vx_kernel_description_t * vx_kernel_REDUCE_list[]; - -static void _set_inputs_outputs - ( - vx_reference * params, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vx_uint32 i; - vx_uint32 cnt; - - /* Set inputs */ - cnt = 0; - for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)inputs[i]->t; - } - - /* Set outputs */ - for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)outputs[i]->t; - } -} /* _set_inputs_outputs() */ - -static vx_status _create_params - ( - vsi_nn_node_t * node, - vx_reference * params, - vx_uint32 num - ) -{ - vx_status status; - vx_context ctx; - vsi_nn_reduce_param * p = NULL; - if( 0 == num ) - { - return VX_SUCCESS; - } - memset( params, 0, sizeof( vx_reference * ) * num ); - p = &node->nn_param.reduce; - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ -#define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ - status = vxGetStatus( params[i] ); \ - if( VX_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - _SET_PARAM( 0, VX_TYPE_INT32, axis_num ); - _SET_PARAM( 1, VX_TYPE_INT32, keep_dim ); - _SET_PARAM( 2, VX_TYPE_INT32, axis[0] ); - _SET_PARAM( 3, VX_TYPE_INT32, axis[1] ); - _SET_PARAM( 4, VX_TYPE_INT32, axis[2] ); - _SET_PARAM( 5, VX_TYPE_INT32, axis[3] ); - -#undef _SET_PARAM -set_param_error: - - return status; -} /* _create_params */ - -static void _release_params - ( - vx_reference * params, - vx_uint32 num - ) -{ - vx_uint32 i; - vx_scalar scalar; - for( i = 0; i < num; i ++ ) - { - scalar = (vx_scalar)params[i]; - vxReleaseScalar( &scalar ); - } -} /* _release_params() */ - -static vx_status cpu_op_compute - ( - vsi_nn_node_t * self, - 
vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vx_status status = VX_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args = NULL; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VX_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_nn_op_compute_t op_compute_list[] = -{ - cpu_op_compute, - NULL -}; -#endif - static vsi_status op_comput_reduce_mean(vsi_nn_node_t * self, vsi_nn_tensor_t *axis_tensor, vx_bool keep_dim, @@ -278,7 +157,6 @@ static vsi_status op_compute ) { vsi_status status = VSI_FAILURE; -#if (USE_OVX_API == TRUE) if (self->nn_param.reduce.type == VSI_NN_REDUCE_MEAN) { @@ -574,30 +452,6 @@ static vsi_status op_compute status = vsi_nn_internal_compute_node( self ); } -#else - vsi_nn_kernel_info_t kernel_info; - - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "vsi_nn_kernel_reduce"; - kernel_info.type = VX_KERNEL_TYPE_CPU; - kernel_info.kernel = vx_kernel_REDUCE_list; - kernel_info.kernel_index = 0; - kernel_info.init_index = 0; - - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) free(kernel_info.resource_name); - if( NULL == self->n ) - { - return VSI_FAILURE; - } - if (NULL != op_compute_list[kernel_info.init_index]) - { - status = op_compute_list[kernel_info.init_index](self, inputs, outputs); - } -#endif return status; } /* op_compute() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c index 9fed06c..51ea588 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c @@ -50,215 +50,6 @@ #define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) #define _PARAM_NUM (_ARG_NUM + _IO_NUM) -#define USE_OVX_API TRUE - -#if (USE_OVX_API == FALSE) -extern vx_kernel_description_t * vx_kernel_RESIZE_list[]; - -static void _set_inputs_outputs - ( - vx_reference * params, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - uint32_t i; - uint32_t cnt; - - /* Set inputs */ - cnt = 0; - for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)inputs[i]->t; - } - - /* Set outputs */ - for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)outputs[i]->t; - } -} /* _set_inputs_outputs() */ - -static vsi_status _create_params - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num - ) -{ - vsi_status status; - vx_context ctx; - vsi_nn_resize_param * p; - if( 0 == num ) - { - return VSI_SUCCESS; - } - memset( params, 0, sizeof( vx_reference * ) * num ); - p = &node->nn_param.resize; - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ -#define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ - status = vxGetStatus( params[i] ); \ - if( VSI_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - _SET_PARAM( 0, VX_TYPE_INT32, factor ); -#undef _SET_PARAM -set_param_error: - - return status; -} /* _create_params */ - -static void _release_params - ( - vx_reference * params, - 
uint32_t num - ) -{ - uint32_t i; - vx_scalar scalar; - for( i = 0; i < num; i ++ ) - { - scalar = (vx_scalar)params[i]; - vxReleaseScalar( &scalar ); - } -} /* _release_params() */ - -static vsi_status op_pre_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs, - vsi_nn_kernel_info_t * kernel_info - ) -{ - vsi_nn_type_e inputFormat = inputs[0]->attr.dtype.vx_type; - vsi_nn_type_e outputFormat = outputs[0]->attr.dtype.vx_type; - vsi_nn_type_e enableFormat; - float scale_factor = self->nn_param.resize.factor; - - enableFormat = ((inputFormat == VSI_NN_TYPE_FLOAT16 && outputFormat == VSI_NN_TYPE_FLOAT16) || - (inputFormat == VSI_NN_TYPE_INT16 && outputFormat == VSI_NN_TYPE_INT16) || - (inputFormat == VSI_NN_TYPE_INT8 && outputFormat == VSI_NN_TYPE_INT8) || - (inputFormat == VSI_NN_TYPE_UINT8 && outputFormat == VSI_NN_TYPE_UINT8)); - - if(scale_factor == 0.5f && enableFormat && inputs[0]->attr.size[1] % 2 == 0 - && inputs[0]->attr.size[1] * inputs[0]->attr.size[2] < 65536) - { - kernel_info->type = VX_KERNEL_TYPE_VX; - kernel_info->init_index = 1; - if (inputFormat == VX_TYPE_FLOAT16 || inputFormat == VX_TYPE_INT16 ) - { - kernel_info->kernel_index = 1; - } - else - { - kernel_info->kernel_index = 2; - } - } - else - { - kernel_info->type = VX_KERNEL_TYPE_CPU; - kernel_info->kernel_index = 0; - kernel_info->init_index = 0; - } - - return VSI_SUCCESS; -} - -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_status vx_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_IO_NUM]; - vx_border_t border; - int32_t sizes[4] = {0}; - uint32_t dims = 2; - uint32_t input_size[4] = {1, 1, 1, 1}; - uint32_t output_size[4] = {1, 1, 1, 1}; - uint32_t i; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - for(i = 0; i < inputs[0]->attr.dim_num; ++i) - { - input_size[i] = inputs[0]->attr.size[i]; - } - for(i = 0; i < outputs[0]->attr.dim_num; ++i) - { - output_size[i] = outputs[0]->attr.size[i]; - } - - - sizes[0] = input_size[0]; - sizes[1] = input_size[1] * input_size[2] * input_size[3]; - self->nn_param.resize.local.local_tensor[0] = vxReshapeTensor(inputs[0]->t, sizes, dims); - - sizes[0] = output_size[0]; - sizes[1] = output_size[1] * output_size[2] * output_size[3]; - self->nn_param.resize.local.local_tensor[1] = vxReshapeTensor(outputs[0]->t, sizes, dims); - - params[0] = (vx_reference)self->nn_param.resize.local.local_tensor[0]; - params[1] = (vx_reference)self->nn_param.resize.local.local_tensor[1]; - - /* Pass parameters to node. 
*/ - status = vsi_nn_ClientNodePassParameters( self->n, params, _IO_NUM ); - - border.mode = VX_BORDER_REPLICATE; - border.constant_value.U32 = 0; - status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border)); - - return status; -} - -static vsi_nn_op_compute_t op_compute_list[] = -{ - cpu_op_compute, - vx_op_compute, - NULL -}; -#endif static vsi_bool _is_same_shape ( @@ -289,7 +80,7 @@ static vsi_status op_compute ) { vsi_status status = VSI_FAILURE; -#if (USE_OVX_API == TRUE) + if ( ((self->nn_param.resize.align_corners || self->nn_param.resize.half_pixel_centers) && (VSI_NN_INTERPOLATION_BILINEAR == self->nn_param.resize.type || VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR == self->nn_param.resize.type)) @@ -318,30 +109,7 @@ static vsi_status op_compute status = VSI_SUCCESS; } } -#else - vsi_nn_kernel_info_t kernel_info; - - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name = "vsi_nn_kernel_resize"; - kernel_info.kernel = vx_kernel_RESIZE_list; - - op_pre_compute(self, inputs, outputs, &kernel_info); - - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) free(kernel_info.resource_name); - if( NULL == self->n ) - { - return VSI_FAILURE; - } - if (NULL != op_compute_list[kernel_info.init_index]) - { - status = op_compute_list[kernel_info.init_index](self, inputs, outputs); - } -#endif return status; } /* op_compute() */ @@ -446,17 +214,6 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { -#if (USE_OVX_API == FALSE) - uint32_t i; - for (i = 0; i < _VSI_NN_RESIZE_LOCAL_TENSOR_NUM; i++) - { - if (self->nn_param.resize.local.local_tensor[i] != NULL) - { - vxReleaseTensor(&(self->nn_param.resize.local.local_tensor[i])); - self->nn_param.resize.local.local_tensor[i] = NULL; - } - } -#endif if ((self->nn_param.resize.align_corners || self->nn_param.resize.half_pixel_centers) && (VSI_NN_INTERPOLATION_BILINEAR == self->nn_param.resize.type || VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR == self->nn_param.resize.type)) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_align.c b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_align.c index 53f5bd4..472f994 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_align.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_align.c @@ -31,166 +31,15 @@ #include "vsi_nn_node.h" #include "vsi_nn_prv.h" #include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_util.h" #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" #include "client/vsi_nn_vxkernel.h" - -#define _ARG_NUM (6) -#define _INPUT_NUM (3) -#define _OUTPUT_NUM (1) -#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _PARAM_NUM (_ARG_NUM + _IO_NUM) - -extern vx_kernel_description_t * vx_kernel_ROI_ALIGN_list[]; - -static void _set_inputs_outputs - ( - vx_reference * params, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - uint32_t i; - uint32_t cnt; - - /* Set inputs */ - cnt = 0; - for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)inputs[i]->t; - } - - /* Set outputs */ - for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)outputs[i]->t; - } -} /* _set_inputs_outputs() */ - -static vsi_status _create_params - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num - ) -{ - vsi_status status; - vx_context ctx; - vsi_nn_roi_align_param * p; - if( 0 == num ) - { - return 
VSI_SUCCESS; - } - memset( params, 0, sizeof( vx_reference * ) * num ); - p = &(node->nn_param.roi_align); - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ - #define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ - status = vxGetStatus( params[i] ); \ - if( VSI_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - _SET_PARAM( 0, VX_TYPE_INT32, output_height ); - _SET_PARAM( 1, VX_TYPE_INT32, output_width ); - _SET_PARAM( 2, VX_TYPE_FLOAT32, height_ratio ); - _SET_PARAM( 3, VX_TYPE_FLOAT32, width_ratio ); - _SET_PARAM( 4, VX_TYPE_INT32, height_sample_num ); - _SET_PARAM( 5, VX_TYPE_INT32, width_sample_num ); - #undef _SET_PARAM -set_param_error: - - return status; -} /* _create_params */ - -static void _release_params - ( - vx_reference * params, - uint32_t num - ) -{ - uint32_t i; - vx_scalar scalar; - for( i = 0; i < num; i ++ ) - { - scalar = (vx_scalar)params[i]; - vxReleaseScalar( &scalar ); - } -} /* _release_params() */ - -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_status vx_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - /*TODO: Add code if need to change your parameter*/ - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. 
*/ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_nn_op_compute_t op_compute_list[] = -{ - cpu_op_compute, - vx_op_compute, - NULL -}; +#include "kernel/vsi_nn_kernel.h" +#include "vsi_nn_internal_node.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "utils/vsi_nn_constraint_check.h" static vsi_status op_compute ( @@ -199,46 +48,31 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - vsi_nn_kernel_info_t kernel_info; - char *path = NULL; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + float width_ratio = self->nn_param.roi_align.width_ratio; + float height_ratio = self->nn_param.roi_align.height_ratio; + int32_t width_sample_num = self->nn_param.roi_align.width_sample_num; + int32_t height_sample_num = self->nn_param.roi_align.height_sample_num; - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - status = VSI_FAILURE; - kernel_info.type = VX_KERNEL_TYPE_CPU; - kernel_info.kernel = vx_kernel_ROI_ALIGN_list; - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "vsi_nn_kernel_roi_align"; - path = getenv("USER_VX_SOURCE_PATH"); - if(path) - vsi_nn_VxResourceSetPath(path); + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_float32( param, "width_ratio", width_ratio ); + vsi_nn_kernel_param_add_float32( param, "height_ratio", height_ratio ); + vsi_nn_kernel_param_add_int32( param, "width_sample_num", width_sample_num ); + vsi_nn_kernel_param_add_int32( param, "height_sample_num", height_sample_num ); - if( kernel_info.type == VX_KERNEL_TYPE_VX) + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "roi_align", + inputs, 3, + outputs, 1, param ); + + if ( self->n ) { - kernel_info.kernel_index = 1; - kernel_info.init_index = 1; - } - else /*kernel_info.type = VX_KERNEL_TYPE_CPU;*/ - { - kernel_info.kernel_index = 0; - kernel_info.init_index = 0; + status = VSI_SUCCESS; } - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) - { - free(kernel_info.resource_name); - } - if( NULL == self->n ) - { - return VSI_FAILURE; - } - if (NULL != op_compute_list[kernel_info.init_index]) - { - status = op_compute_list[kernel_info.init_index](self, inputs, outputs); - } + vsi_nn_kernel_param_release( ¶m ); + return status; } /* op_compute() */ @@ -249,17 +83,10 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - /*TODO: Check tensor shapes. */ - //If input0 is uint8, then input1 MUST be uint16, - //with zero point of 0 and scale of 0.125 - if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 && - inputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_UINT16) - { - return FALSE; - } return TRUE; } /* op_check() */ + static vsi_bool op_setup ( vsi_nn_node_t * self, @@ -267,19 +94,20 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - /* TODO: Add code to compute outputs' shape. 
*/ - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { vsi_nn_roi_align_param *p; p = &(self->nn_param.roi_align); outputs[0]->attr.dim_num = 4; - outputs[0]->attr.size[0] = inputs[0]->attr.size[0]; - outputs[0]->attr.size[1] = p->output_width; - outputs[0]->attr.size[2] = p->output_height; + outputs[0]->attr.size[0] = p->output_width; + outputs[0]->attr.size[1] = p->output_height; + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; outputs[0]->attr.size[3] = inputs[1]->attr.size[1]; } + return TRUE; -} /* op_setup() */ +} /* op_init() */ + #ifdef __cplusplus extern "C" { @@ -294,8 +122,8 @@ DEF_OP_REG /* check */ op_check, /* setup */ op_setup, /* optimize */ NULL, - /* input_num */ _INPUT_NUM, - /* output_num */ _OUTPUT_NUM + /* input_num */ 3, + /* output_num */ 1 ); #ifdef __cplusplus } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_scale.c b/src/tim/vx/internal/src/ops/vsi_nn_op_scale.c index 61b9d13..b7c4056 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_scale.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_scale.c @@ -26,201 +26,21 @@ #include "vsi_nn_types.h" #include "vsi_nn_platform.h" +#include "vsi_nn_log.h" #include "vsi_nn_graph.h" #include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" +#include "utils/vsi_nn_link_list.h" +#include "utils/vsi_nn_dtype_util.h" #include "client/vsi_nn_vxkernel.h" -#include "utils/vsi_nn_constraint_check.h" -#define _ARG_NUM (2) #define _INPUT_NUM (3) #define _OUTPUT_NUM (1) -#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _PARAM_NUM (_ARG_NUM + _IO_NUM) - -extern vx_kernel_description_t * vx_kernel_SCALE_list[]; - -static void _set_inputs_outputs - ( - vx_reference * params, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - uint32_t i; - uint32_t cnt; - - /* Set inputs */ - cnt = 0; - for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)inputs[i]->t; - } - - /* Set outputs */ - for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)outputs[i]->t; - } -} /* _set_inputs_outputs() */ - -static vsi_status _create_params - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num - ) -{ - vsi_status status; - vx_context ctx; - vsi_nn_scale_param * p; - if( 0 == num ) - { - return VSI_SUCCESS; - } - memset( params, 0, sizeof( vx_reference * ) * num ); - p = (vsi_nn_scale_param *)node->nn_param.client_param; - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ -#define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ - status = vxGetStatus( params[i] ); \ - if( VSI_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - _SET_PARAM( 0, VX_TYPE_INT32, axis ); - _SET_PARAM( 1, VX_TYPE_FLOAT32, bias ); -#undef _SET_PARAM -set_param_error: - - return status; -} /* _create_params */ - -static void _release_params - ( - vx_reference * params, - uint32_t num - ) -{ - uint32_t i; - vx_scalar scalar; - for( i = 0; i < num; i ++ ) - { - scalar = (vx_scalar)params[i]; - vxReleaseScalar( &scalar ); - } -} /* _release_params() */ - -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - 
args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static void reshape_tensor_shape - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t * input, - vx_reference * params, - uint32_t index - ) -{ - uint32_t i; - int32_t size[4] = {0}; - int32_t size0[4] = {1, 1, 1, 1}; - uint32_t dims = 2; - - for( i = 0; i < input->attr.dim_num; i++ ) - { - size0[i] = input->attr.size[i]; - } - - size[0] = size0[0]; - size[1] = size0[1] * size0[2] * size0[3]; - - self->nn_param.scale.local.local_tensor[index] = - vxReshapeTensor(input->t, size, dims); - params[index] = (vx_reference)self->nn_param.scale.local.local_tensor[index]; -} - -static vsi_status vx_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_IO_NUM]; - vx_border_t border; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - if (inputs[0]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16 || - inputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16 || - inputs[2]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT32 || - outputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16) - { - VSILOGE("scale shader unsuport format!\n"); - return VSI_FAILURE; - } - - reshape_tensor_shape(self, inputs[0], params, 0); - reshape_tensor_shape(self, inputs[1], params, 1); - reshape_tensor_shape(self, inputs[2], params, 2); - reshape_tensor_shape(self, outputs[0], params, 3); - - /* Pass parameters to node. 
*/ - status = vsi_nn_ClientNodePassParameters( self->n, params, _IO_NUM ); - - border.mode = VX_BORDER_REPLICATE; - border.constant_value.U32 = 0; - status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border)); - - return status; -} - -static vsi_nn_op_compute_t op_init_list[] = -{ - cpu_op_compute, - vx_op_compute, - NULL -}; static vsi_status op_compute ( @@ -229,31 +49,7 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - vsi_nn_kernel_info_t kernel_info; - - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - status = VSI_FAILURE; - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "vsi_nn_kernel_scale"; - kernel_info.type = vsi_nn_GetVXKernelTypeForShader(); - kernel_info.kernel = vx_kernel_SCALE_list; - kernel_info.kernel_index = 1; - kernel_info.init_index = 1; - - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) free(kernel_info.resource_name); - if( NULL == self->n ) - { - return status; - } - if (NULL != op_init_list[kernel_info.init_index]) - { - status = op_init_list[kernel_info.init_index](self, inputs, outputs); - } - return status; + return vsi_nn_internal_compute_node( self ); } /* op_compute() */ static vsi_bool op_check @@ -263,38 +59,55 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - BEGIN_IO_TYPE_DECL(SCALE, 3, 1) - IO_TYPE(D_F16, D_F16, D_F32, D_F16) - END_IO_TYPE_DECL(SCALE) - if(!VALIDATE_OP_IO_TYPES(SCALE, self, inputs, self->input.num, outputs, self->output.num)) { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; - } + /*TODO: Check tensor shapes. 
*/ return TRUE; } /* op_check() */ +static vsi_bool op_setup + ( + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_internal_node_t* curr = NULL; + vsi_bool ret = TRUE; + + vsi_nn_internal_init_node_wksp( node ); + + curr = vsi_nn_internal_new_node( node, VSI_NN_OP_A_TIMES_B_PLUS_C, node->input.num, node->output.num ); + curr->inputs[0] = inputs[0]; + curr->inputs[1] = inputs[1]; + curr->inputs[2] = inputs[2]; + curr->outputs[0] = outputs[0]; + + vsi_nn_internal_setup_node(node, curr); + + return ret; +} /* op_setup() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + return vsi_nn_internal_optimize_node( self, direction ); +} /* op_optimize() */ + static vsi_status op_deinit ( vsi_nn_node_t * self ) { - uint32_t i; - for (i = 0; i < _VSI_NN_SCALE_LOCAL_TENSOR_NUM; i++) - { - if (self->nn_param.scale.local.local_tensor[i] != NULL) - { - vxReleaseTensor(&(self->nn_param.scale.local.local_tensor[i])); - self->nn_param.scale.local.local_tensor[i] = NULL; - } - } + vsi_nn_internal_deinit_node_wksp( self ); + vsi_nn_op_common_deinit(self); return VSI_SUCCESS; -} /* op_deinit() */ - +} #ifdef __cplusplus extern "C" { #endif @@ -306,12 +119,11 @@ DEF_OP_REG /* compute */ op_compute, /* deinit */ op_deinit, /* check */ op_check, - /* setup */ vsi_nn_op_common_setup, - /* optimize */ NULL, + /* setup */ op_setup, + /* optimize */ op_optimize, /* input_num */ _INPUT_NUM, /* output_num */ _OUTPUT_NUM ); #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_shufflechannel.c b/src/tim/vx/internal/src/ops/vsi_nn_op_shufflechannel.c index 5c02808..b87e1e6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_shufflechannel.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_shufflechannel.c @@ -38,265 +38,12 @@ #include "client/vsi_nn_vxkernel.h" #include "utils/vsi_nn_constraint_check.h" -#define USE_OVXLIB (0) - #define _ARG_NUM (2) #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) #define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) #define _PARAM_NUM (_ARG_NUM + _IO_NUM) -#if (USE_OVXLIB) - -extern vx_kernel_description_t * vx_kernel_SHUFFLECHANNEL_list[]; - -static vsi_bool _reshape_tensor - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - uint32_t i = 0; - uint32_t sizes[VSI_NN_MAX_DIM_NUM] = {0}; - int32_t axis = 0; - vsi_nn_shufflechannel_param * p = NULL; - uint32_t before_size = 1; - uint32_t after_size = 1; - uint32_t * input_sizes = inputs[0]->attr.size; - uint32_t dims = inputs[0]->attr.dim_num; - - p = &(self->nn_param.shufflechannel); - axis = p->axis; - - for ( i = 0; i < (uint32_t)axis; i++) - { - before_size *= input_sizes[i]; - } - for ( i = axis + 1; i < dims; i++) - { - after_size *= input_sizes[i]; - } - - if (axis == 2 && after_size == 1) - { - sizes[0] = input_sizes[0]; - sizes[1] = input_sizes[1]; - sizes[2] = input_sizes[2]; - } - else - { - sizes[0] = before_size; - sizes[1] = input_sizes[axis]; - sizes[2] = after_size; - p->axis = 1; - } - dims = 3; - - p->local->input_tensor = vxReshapeTensor(inputs[0]->t, (int32_t *)sizes, dims); - p->local->output_tensor = vxReshapeTensor(outputs[0]->t, (int32_t *)sizes, dims); - - return TRUE; -} - -static void _set_inputs_outputs - ( - vsi_nn_node_t * self, - vx_reference * params, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_nn_shufflechannel_param * p = NULL; - - p = &(self->nn_param.shufflechannel); - 
- params[0] = (vx_reference)p->local->input_tensor; - params[1] = (vx_reference)p->local->output_tensor; -} /* _set_inputs_outputs() */ - -static vsi_status _create_params - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num - ) -{ - vsi_status status = VSI_SUCCESS; - vx_context ctx; - vsi_nn_shufflechannel_param * p = NULL; - if( 0 == num ) - { - return VSI_SUCCESS; - } - memset( params, 0, sizeof( vx_reference * ) * num ); - p = &(node->nn_param.shufflechannel); - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ -#define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ - status = vxGetStatus( params[i] ); \ - if( VSI_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - _SET_PARAM( 0, VX_TYPE_INT32, group_number ); - _SET_PARAM( 1, VX_TYPE_INT32, axis ); -#undef _SET_PARAM -set_param_error: - - return status; -} /* _create_params */ - -static void _release_params - ( - vx_reference * params, - uint32_t num - ) -{ - uint32_t i = 0; - vx_scalar scalar; - for( i = 0; i < num; i ++ ) - { - scalar = (vx_scalar)params[i]; - vxReleaseScalar( &scalar ); - } -} /* _release_params() */ - -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args = NULL; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( self, params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_status vx_op_pre_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs, - vsi_nn_kernel_info_t * kernel_info - ) -{ - vsi_nn_type_e inputDataFormat = inputs[0]->attr.dtype.vx_type; - vsi_nn_type_e outputDataFormat = outputs[0]->attr.dtype.vx_type; - int8_t inputFixedPointPos = inputs[0]->attr.dtype.fl; - int8_t outputFixedPointPos = outputs[0]->attr.dtype.fl; - int32_t inputZeroPoint = inputs[0]->attr.dtype.zero_point; - int32_t outputZeroPoint = outputs[0]->attr.dtype.zero_point; - vx_float32 inputScale = inputs[0]->attr.dtype.scale; - vx_float32 outputScale = outputs[0]->attr.dtype.scale; - int32_t axis = self->nn_param.shufflechannel.axis; - uint32_t *sizes = inputs[0]->attr.size; - vsi_bool is16Bits = FALSE; - vsi_bool is8Bits = FALSE; - - is16Bits = ((inputDataFormat == VSI_NN_TYPE_FLOAT16 && outputDataFormat == VSI_NN_TYPE_FLOAT16) - || (inputDataFormat == VSI_NN_TYPE_INT16 && outputDataFormat == VSI_NN_TYPE_INT16 - && inputFixedPointPos == outputFixedPointPos)) ? TRUE : FALSE; - is8Bits = ((inputDataFormat == VSI_NN_TYPE_INT8 && outputDataFormat == VSI_NN_TYPE_INT8 - && inputFixedPointPos == outputFixedPointPos) - || (inputDataFormat == VSI_NN_TYPE_UINT8 && outputDataFormat == VSI_NN_TYPE_UINT8 - && inputZeroPoint == outputZeroPoint && inputScale == outputScale)) ? 
TRUE : FALSE; -#define VSI_NN_TENSOR_WIDTH_MAX (65536) - kernel_info->kernel_index = 0; - if (sizes[0] < VSI_NN_TENSOR_WIDTH_MAX && sizes[1] < VSI_NN_TENSOR_WIDTH_MAX) - { - if ( is16Bits && axis == 2 ) - { - kernel_info->kernel_index = 1; - } - else if ( is8Bits && axis == 2) - { - kernel_info->kernel_index = 2; - } - else if ( is16Bits && axis == 1) - { - kernel_info->resource_name[0] = "vsi_nn_kernel_shufflechannel_axis1"; - kernel_info->kernel_index = 3; - } - else if ( is8Bits && axis == 1) - { - kernel_info->resource_name[0] = "vsi_nn_kernel_shufflechannel_axis1"; - kernel_info->kernel_index = 4; - } - } -#undef VSI_NN_TENSOR_WIDTH_MAX - - return VSI_SUCCESS; -} - -static vsi_status vx_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_border_t border; - vx_reference * args = NULL; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( self, params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - border.mode = VX_BORDER_REPLICATE; - border.constant_value.U32 = 0; - status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border)); - - return status; -} - -static vsi_nn_op_compute_t op_compute_list[] = -{ - cpu_op_compute, - vx_op_compute, - NULL -}; - -#endif - static vsi_status op_compute ( vsi_nn_node_t * self, @@ -304,41 +51,6 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { -#if(USE_OVXLIB) - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_info_t kernel_info; - - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - - /* setup input/output shape */ - _reshape_tensor( self, inputs, outputs); - - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "vsi_nn_kernel_shufflechannel"; - kernel_info.type = vsi_nn_GetVXKernelTypeForShader(); - kernel_info.kernel = vx_kernel_SHUFFLECHANNEL_list; - kernel_info.init_index = 1; - - if (vsi_nn_is_do_vx_op_pre_init(kernel_info.type)) - { - vx_op_pre_compute(self, inputs, outputs, &kernel_info); - } - - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) free(kernel_info.resource_name); - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - if (NULL != op_compute_list[kernel_info.init_index]) - { - status = op_compute_list[kernel_info.init_index](self, inputs, outputs); - } - return status; -#else vsi_status status = VSI_FAILURE; vx_nn_reorg_params_ext2_t param; vsi_nn_tensor_t *block_size_tensor = NULL; @@ -381,7 +93,6 @@ static vsi_status op_compute } return status; -#endif } /* op_compute() */ static vsi_bool op_setup diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c b/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c index 47a5ac7..3a2aea3 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c @@ -37,208 +37,11 @@ #include "utils/vsi_nn_math.h" #include "client/vsi_nn_vxkernel.h" #include "libnnext/vx_lib_nnext.h" +#include "vsi_nn_test.h" #include "utils/vsi_nn_constraint_check.h" -#define _ARG_NUM (2) #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) -#define _IO_NUM 
(_INPUT_NUM + _OUTPUT_NUM) -#define _PARAM_NUM (_ARG_NUM + _IO_NUM) - -extern vx_kernel_description_t * vx_kernel_SPACE2DEPTH_list[]; - -static void _set_inputs_outputs - ( - vx_reference * params, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - uint32_t i; - uint32_t cnt; - - /* Set inputs */ - cnt = 0; - for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)inputs[i]->t; - } - - /* Set outputs */ - for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)outputs[i]->t; - } -} /* _set_inputs_outputs() */ - -static vsi_status _create_params - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num - ) -{ - vsi_status status; - vx_context ctx; - vsi_nn_space2depth_param * p; - if( 0 == num ) - { - return VSI_SUCCESS; - } - memset( params, 0, sizeof( vx_reference * ) * num ); - p = &(node->nn_param.space2depth); - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ -#define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ - status = vxGetStatus( params[i] ); \ - if( VSI_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - _SET_PARAM( 0, VX_TYPE_INT32, block_size[0] ); - _SET_PARAM( 1, VX_TYPE_INT32, block_size[1] ); -#undef _SET_PARAM -set_param_error: - - return status; -} /* _create_params */ - -static void _release_params - ( - vx_reference * params, - uint32_t num - ) -{ - uint32_t i; - vx_scalar scalar; - for( i = 0; i < num; i ++ ) - { - scalar = (vx_scalar)params[i]; - vxReleaseScalar( &scalar ); - } -} /* _release_params() */ - -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_status vx_op_pre_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs, - vsi_nn_kernel_info_t * kernel_info - ) -{ - vsi_status status = VX_SUCCESS; - vsi_nn_type_e dataFormat = inputs[0]->attr.dtype.vx_type; - int8_t input_fixPointPos = 0; - int8_t output_fixPointPos = 0; - vx_bool dataTypeFlg = FALSE; - vsi_nn_tensor_attr_t attr[2]; - - memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t)); - memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t)); - - status = vsi_nn_vxGetTensorAttr(inputs[0]->t, &attr[0]); - status |= vsi_nn_vxGetTensorAttr(outputs[0]->t, &attr[1]); - if (status != VX_SUCCESS) - { - VSILOGE("vsi_nn_vxGetTensorAttr failure! 
at line %d\n", __LINE__); - return status; - } - - input_fixPointPos = attr[0].dtype.fl; - output_fixPointPos = attr[1].dtype.fl; - - if(input_fixPointPos == output_fixPointPos) - dataTypeFlg = TRUE; - - if ((dataFormat == VSI_NN_TYPE_INT16 && dataTypeFlg) || dataFormat == VSI_NN_TYPE_FLOAT16) - { - kernel_info->kernel_index = 2; - } - else - { - VSILOGE("Not support input or output data format!(PRELU)\n"); - return VSI_FAILURE; - } - - return VSI_SUCCESS; -} - -static vsi_status vx_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_border_t border; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - border.mode = VX_BORDER_REPLICATE; - border.constant_value.U32 = 0; - status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border)); - - return status; -} - -static vsi_nn_op_compute_t op_compute_list[] = -{ - cpu_op_compute, - vx_op_compute, - NULL -}; static vsi_status op_compute ( @@ -248,23 +51,26 @@ static vsi_status op_compute ) { vsi_status status = VSI_FAILURE; - int32_t size_x = self->nn_param.space2depth.block_size[0]; - int32_t size_y = self->nn_param.space2depth.block_size[1]; - if (size_x == size_y) + if (self->nn_param.space2depth.block_size[0] == self->nn_param.space2depth.block_size[1]) { vx_nn_reorg_params_t param; vsi_nn_tensor_t *block_size_tensor = NULL; + vsi_nn_tensor_attr_t attr; memset(¶m, 0, sizeof(vx_nn_reorg_params_t)); - block_size_tensor = vsi_nn_VariableToTensor(self, - (uint8_t *)&self->nn_param.space2depth.block_size[0], - VSI_NN_TYPE_INT32); - if( NULL == block_size_tensor ) - { - VSILOGE("Create block_size_tensor fail.(space2depth)"); - return VSI_FAILURE; - } - self->nn_param.space2depth.local.block_size_tensor = block_size_tensor; + memset(&attr, 0, sizeof(attr)); + attr.size[0] = 2; + attr.size[1] = 1; + attr.dim_num = 2; + attr.is_const = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + block_size_tensor = vsi_nn_CreateTensorFromData( + self->graph, + (uint8_t *)self->nn_param.space2depth.block_size, + &attr); + TEST_CHECK_PTR(block_size_tensor, final); + param.block_size = REQUIRED_IO(block_size_tensor); param.type = VX_REORG_SPACE_TO_DEPTH; @@ -274,46 +80,39 @@ static vsi_status op_compute sizeof(vx_nn_reorg_params_t), outputs[0]->t); - if( NULL != self->n ) + if ( NULL != self->n ) { status = VSI_SUCCESS; } +final: + if (block_size_tensor) vsi_nn_ReleaseTensor(&block_size_tensor); } else { - vsi_nn_kernel_info_t kernel_info; - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "vsi_nn_kernel_space2depth"; - //kernel_info.type = VX_KERNEL_TYPE_CPU; - kernel_info.type = vsi_nn_GetVXKernelTypeForShader(); - kernel_info.kernel = vx_kernel_SPACE2DEPTH_list; - kernel_info.kernel_index = 1; - //kernel_info.init_index = 0; - kernel_info.init_index = 1; - - if (vsi_nn_is_do_vx_op_pre_init(kernel_info.type)) - { - vx_op_pre_compute(self, inputs, outputs, 
&kernel_info); - } - - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) free(kernel_info.resource_name); - if( NULL == self->n ) - { - return VSI_FAILURE; - } - if (NULL != op_compute_list[kernel_info.init_index]) - { - status = op_compute_list[kernel_info.init_index](self, inputs, outputs); - } + status = vsi_nn_internal_compute_node( self ); } return status; } /* op_compute() */ +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + if (self->nn_param.space2depth.block_size[0] != self->nn_param.space2depth.block_size[1]) + { + return vsi_nn_internal_optimize_node(self, direction ); + } + else + { + return VSI_SUCCESS; + } +} /* op_optimize() */ + static vsi_bool op_check ( vsi_nn_node_t * self, @@ -321,7 +120,7 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - if(self->nn_param.space2depth.block_size[0] < 0 + if (self->nn_param.space2depth.block_size[0] < 0 || self->nn_param.space2depth.block_size[1] < 0) { VSILOGE("Block size can't be less than zero in space to depth"); @@ -341,7 +140,7 @@ static vsi_bool op_check /* HW 9.0 */ IO_TYPE(D_BF16, D_BF16) END_IO_TYPE_DECL(SPACE2DEPTH) - if(!VALIDATE_OP_IO_TYPES(SPACE2DEPTH, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(SPACE2DEPTH, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); @@ -353,6 +152,30 @@ static vsi_bool op_check return TRUE; } /* op_check() */ +static vsi_bool op_set_space2depth_internal + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_op_t type_name + ) +{ + vsi_bool retn = TRUE; + vsi_nn_internal_node_t* curr = NULL; + + vsi_nn_internal_init_node_wksp( self ); + + curr = vsi_nn_internal_new_node( self, type_name, 0, 0 ); + curr->node->nn_param.space2depth_internal.block_size_x = + self->nn_param.space2depth.block_size[0]; + curr->node->nn_param.space2depth_internal.block_size_y = + self->nn_param.space2depth.block_size[1]; + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + retn = vsi_nn_internal_setup_node(self, curr); + + return retn; +} static vsi_bool op_setup ( @@ -361,9 +184,10 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + vsi_bool ret = TRUE; uint32_t size_x = node->nn_param.space2depth.block_size[0]; uint32_t size_y = node->nn_param.space2depth.block_size[1]; - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; outputs[0]->attr.size[0] = inputs[0]->attr.size[0] / size_x; @@ -372,7 +196,12 @@ static vsi_bool op_setup outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; } - return TRUE; + if (size_x != size_y) + { + ret = op_set_space2depth_internal(node, inputs, outputs, VSI_NN_OP_SPACE2DEPTH_INTERNAL); + } + + return ret; } /* op_setup() */ static vsi_status op_deinit @@ -380,11 +209,14 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { - if (self->nn_param.space2depth.local.block_size_tensor != NULL) + if (self->nn_param.space2depth.block_size[0] != self->nn_param.space2depth.block_size[1]) { - vsi_nn_ReleaseTensor(&(self->nn_param.space2depth.local.block_size_tensor)); + vsi_nn_internal_deinit_node_wksp(self); + } + else + { + 
vsi_nn_op_common_deinit(self); } - vsi_nn_op_common_deinit(self); return VSI_SUCCESS; } /* op_deinit() */ @@ -401,9 +233,9 @@ DEF_OP_REG /* deinit */ op_deinit, /* check */ op_check, /* setup */ op_setup, - /* optimize */ NULL, - /* input_num */ 1, - /* output_num */ 1 + /* optimize */ op_optimize, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM ); #ifdef __cplusplus } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth_internal.c new file mode 100644 index 0000000..5660eea --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth_internal.c @@ -0,0 +1,159 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+*
+*****************************************************************************/
+#include <string.h>
+#include <stdlib.h>
+
+#include "vsi_nn_types.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_node.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_ops.h"
+#include "vsi_nn_tensor.h"
+#include "vsi_nn_tensor_util.h"
+#include "utils/vsi_nn_util.h"
+#include "utils/vsi_nn_math.h"
+#include "kernel/vsi_nn_kernel.h"
+#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
+#include "utils/vsi_nn_constraint_check.h"
+
+#define _INPUT_NUM (1)
+#define _OUTPUT_NUM (1)
+
+static vsi_status op_compute
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    vsi_status status = VSI_FAILURE;
+    vsi_nn_kernel_param_t * param = NULL;
+    int32_t block_size_x = self->nn_param.space2depth_internal.block_size_x;
+    int32_t block_size_y = self->nn_param.space2depth_internal.block_size_y;
+
+    if ( NULL == self )
+    {
+        return VSI_FAILURE;
+    }
+
+    param = vsi_nn_kernel_param_create();
+
+    // Add params
+    vsi_nn_kernel_param_add_int32( param, "block_size_x", block_size_x );
+    vsi_nn_kernel_param_add_int32( param, "block_size_y", block_size_y );
+
+    self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "space2depth_internal", inputs, 1, outputs, 1, param );
+
+    if ( self->n != NULL )
+    {
+        status = VSI_SUCCESS;
+    }
+
+    if (param != NULL)
+    {
+        vsi_nn_kernel_param_release( &param );
+    }
+    return status;
+} /* op_compute() */
+
+static vsi_bool op_setup
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    uint32_t size_x = self->nn_param.space2depth_internal.block_size_x;
+    uint32_t size_y = self->nn_param.space2depth_internal.block_size_y;
+    if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
+    {
+        outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
+        outputs[0]->attr.size[0] = inputs[0]->attr.size[0] * size_x;
+        outputs[0]->attr.size[1] = inputs[0]->attr.size[1] * size_y;
+        outputs[0]->attr.size[2] = inputs[0]->attr.size[2] / (size_x * size_y);
+        outputs[0]->attr.size[3] = inputs[0]->attr.size[3];
+    }
+
+    return TRUE;
+} /* op_setup() */
+
+static vsi_bool op_check
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    BEGIN_IO_TYPE_DECL(SPACE2DEPTH_INTERNAL, 1, 1)
+        IO_TYPE(D_F16, D_F16)
+        IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP)
+        IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP)
+        IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM)
+        IO_TYPE(D_F32, D_F32)
+        IO_TYPE(D_F32, D_BF16)
+        IO_TYPE(D_BF16, D_F32)
+
+        /* HW 9.0 */
+        IO_TYPE(D_BF16, D_BF16)
+    END_IO_TYPE_DECL(SPACE2DEPTH_INTERNAL)
+    if (!VALIDATE_OP_IO_TYPES(SPACE2DEPTH_INTERNAL, self, inputs, self->input.num, outputs, self->output.num)) {
+        char* desc = generate_op_io_types_desc(inputs,
+                self->input.num, outputs, self->output.num);
+        VSILOGE("Inputs/Outputs data type not support: %s", desc);
+        destroy_op_io_types_desc(desc);
+        return FALSE;
+    }
+
+    return TRUE;
+} /* op_check() */
+
+static vsi_status op_deinit
+    (
+    vsi_nn_node_t * self
+    )
+{
+    vsi_nn_op_common_deinit(self);
+
+    return VSI_SUCCESS;
+} /* op_deinit() */
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+DEF_OP_REG
+    (
+    /* op_name    */ SPACE2DEPTH_INTERNAL,
+    /* init       */ NULL,
+    /* compute    */ op_compute,
+    /* deinit     */ op_deinit,
+    /* check      */ op_check,
+    /* setup      */ op_setup,
+    /* optimize   */ NULL,
+    /* input_num  */ _INPUT_NUM,
+    /* output_num */ _OUTPUT_NUM
+    );
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c b/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c
index 4283e40..39d32a5 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c
@@ -346,10 +346,10 @@ static vsi_bool op_check
 {
     BEGIN_IO_TYPE_DECL(TENSORSTACKCONCAT, 2, 1)
-        IO_TYPE(D_F16, D_F16, D_F16)
-        IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM)
-        IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP)
-        IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP)
+        IO_TYPE(D_F16, D_I32, D_F16)
+        IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM)
+        IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP)
+        IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP)
     END_IO_TYPE_DECL(TENSORSTACKCONCAT)
     if(!VALIDATE_OP_IO_TYPES(TENSORSTACKCONCAT, self, inputs, self->input.num, outputs, self->output.num)) {
         char* desc = generate_op_io_types_desc(inputs,
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c b/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c
index 906fb7c..5717fe3 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c
@@ -182,7 +182,7 @@ static vsi_bool op_setup
     vsi_nn_internal_setup_node( self, curr );

     slices = (uint32_t *)vsi_nn_internal_new_node_param(curr,
-        VSI_NN_MAX_DIM_NUM * sizeof(uint32_t));
+        tensor_num * sizeof(uint32_t));
     curr = vsi_nn_internal_new_node( self, VSI_NN_OP_SPLIT, 1, tensor_num );
     curr->node->nn_param.split.axis = 1;
     curr->node->nn_param.split.slices = slices;
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_upsamplescale.c b/src/tim/vx/internal/src/ops/vsi_nn_op_upsamplescale.c
new file mode 100644
index 0000000..c79c373
--- /dev/null
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_upsamplescale.c
@@ -0,0 +1,253 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+
+#include <string.h>
+#include <stdlib.h>
+
+#include "vsi_nn_types.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_node.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_ops.h"
+#include "vsi_nn_tensor.h"
+#include "utils/vsi_nn_util.h"
+#include "kernel/vsi_nn_kernel.h"
+#include "utils/vsi_nn_constraint_check.h"
+
+typedef struct _upsamplescale_local_data_t {
+    int32_t placeholder;
+} upsamplescale_local_data_t;
+
+/*
+ Declare number of input and output.
+ */
+#define _INPUT_NUM (1)
+#define _OUTPUT_NUM (1)
+
+#define _EPSILON 1e-8
+
+static vsi_status op_compute
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    vsi_status status = VSI_FAILURE;
+    int32_t stride = self->nn_param.upsamplescale.stride;
+    float scale = self->nn_param.upsamplescale.scale;
+    vsi_nn_kernel_param_t * param = NULL;
+
+    if( NULL == self )
+    {
+        return VSI_FAILURE;
+    }
+
+    if (stride == 1 || vsi_nn_abs(scale - 1.0f) == _EPSILON)
+    {
+        return vsi_nn_internal_compute_node( self );
+    }
+
+    param = vsi_nn_kernel_param_create();
+
+    vsi_nn_kernel_param_add_int32( param, "stride", stride );
+    vsi_nn_kernel_param_add_float32( param, "scale", scale );
+
+    self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
+        "upsamplescale",
+        inputs, 1,
+        outputs, 1, param );
+
+    vsi_nn_kernel_param_release( &param );
+
+    if( self->n )
+    {
+        status = VSI_SUCCESS;
+    }
+
+    return status;
+} /* op_compute() */
+
+static vsi_bool op_check
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    BEGIN_IO_TYPE_DECL(UPSAMPLESCALE, 1, 1)
+        IO_TYPE(D_F16, D_U8|Q_ASYM)
+        IO_TYPE(D_F16, D_I16|Q_DFP)
+        IO_TYPE(D_F16, D_I8|Q_DFP)
+        IO_TYPE(D_F16, D_U8)
+        IO_TYPE(D_F16, D_I16)
+        IO_TYPE(D_F16, D_I8)
+        IO_TYPE(D_F16, D_F16)
+        IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM)
+        IO_TYPE(D_U8|Q_ASYM, D_F16)
+        IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP)
+        IO_TYPE(D_I8|Q_DFP, D_F16)
+        IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP)
+        IO_TYPE(D_I16|Q_DFP, D_F16)
+        IO_TYPE(D_U8, D_U8)
+        IO_TYPE(D_U8, D_F16)
+        IO_TYPE(D_I8, D_I8)
+        IO_TYPE(D_I8, D_F16)
+        IO_TYPE(D_I16, D_I16)
+        IO_TYPE(D_I16, D_F16)
+    END_IO_TYPE_DECL(UPSAMPLESCALE)
+    if (!VALIDATE_OP_IO_TYPES(UPSAMPLESCALE, self, inputs, self->input.num, outputs, self->output.num))
+    {
+        char* desc = generate_op_io_types_desc(inputs,
+                self->input.num, outputs, self->output.num);
+        VSILOGE("Inputs/Outputs data type not support: %s", desc);
+        destroy_op_io_types_desc(desc);
+        return FALSE;
+    }
+
+    return TRUE;
+} /* op_check() */
+
+static vsi_status op_optimize
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs,
+    vsi_nn_opt_direction_e direction
+    )
+{
+    int32_t stride = self->nn_param.upsamplescale.stride;
+    float scale = self->nn_param.upsamplescale.scale;
+
+    if (stride == 1 && vsi_nn_abs(scale - 1.0f) == _EPSILON)
+    {
+        return vsi_nn_internal_optimize_node( self, direction );
+    }
+    else
+    {
+        return VSI_SUCCESS;
+    }
+}
+
+static vsi_bool op_setup
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    /* TODO: Add code to compute outputs' shape.
*/ + int32_t stride = self->nn_param.upsamplescale.stride; + float scale = self->nn_param.upsamplescale.scale; + int32_t i = 0; + vsi_nn_internal_node_t* curr = NULL; + + vsi_nn_internal_init_node_wksp(self); + + if (stride == 1 && vsi_nn_abs(scale - 1.0f) == _EPSILON) + { + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 0, 0); + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + + vsi_nn_internal_setup_node(self, curr); + } + else if (stride == 1) + { + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_LINEAR, 0, 0); + curr->node->nn_param.linear.a = scale; + curr->node->nn_param.linear.b = 0; + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + + vsi_nn_internal_setup_node(self, curr); + } + else if (vsi_nn_abs(scale - 1.0f) == _EPSILON) + { + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RESIZE, 0, 0); + curr->node->nn_param.resize.type = VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR; + curr->node->nn_param.resize.align_corners = FALSE; + curr->node->nn_param.resize.half_pixel_centers = FALSE; + curr->node->nn_param.resize.size[0] = inputs[0]->attr.size[0] * stride; + curr->node->nn_param.resize.size[1] = inputs[0]->attr.size[1] * stride; + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + + vsi_nn_internal_setup_node(self, curr); + } + else + { + outputs[0]->attr.size[0] = inputs[0]->attr.size[0] * stride; + outputs[0]->attr.size[1] = inputs[0]->attr.size[1] * stride; + for (i = 2; i < (int32_t)inputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + } + + return TRUE; +} /* op_setup() */ + +static vsi_status op_init + ( + vsi_nn_node_t* self + ) +{ + return VSI_SUCCESS; +} /* op_init() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t* self + ) +{ + vsi_status status = VSI_SUCCESS; + + vsi_nn_internal_deinit_node_wksp( self ); + + status = vsi_nn_op_common_deinit(self); + + return status; +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ UPSAMPLESCALE, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c index 518b099..16c1bff 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c @@ -418,6 +418,7 @@ static _op_param_gen_t s_op_gen[] = /* DECONVOLUTION1D */ NULL, /* INTERP */ NULL, /* RESIZE_1D */ NULL, + /* UPSAMPLESCALE */ NULL, }; _compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c ); diff --git a/src/tim/vx/internal/src/vsi_nn_graph.c b/src/tim/vx/internal/src/vsi_nn_graph.c index 392370a..a49d8f8 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph.c +++ b/src/tim/vx/internal/src/vsi_nn_graph.c @@ -588,11 +588,6 @@ vsi_status vsi_nn_SetupGraph vsi_status status; vsi_nn_node_id_t *sorted_nodes; vsi_nn_node_id_t *nodes_list; - uint32_t num_of_graph_inputs; - vx_reference *graph_inputs = NULL; - uint32_t num_of_graph_outputs; - vx_reference *graph_outputs = NULL; - vsi_nn_tensor_t *tensor; vsi_bool dirty = FALSE; status = VSI_FAILURE; @@ -670,54 +665,9 @@ vsi_status vsi_nn_SetupGraph status = vsi_nn_TrySetupCompleteSignalNode( graph ); TEST_CHECK_STATUS( status, final ); - /* Explicitly set graph inputs 
and outputs */ - num_of_graph_inputs = graph->input.num; - graph_inputs = (vx_reference *)malloc( num_of_graph_inputs * sizeof( vx_reference ) ); - for( i = 0; i < num_of_graph_inputs; i++ ) - { - tensor = vsi_nn_GetTensor( graph, graph->input.tensors[i] ); - if (tensor) - { - graph_inputs[i] = (vx_reference)( tensor->t ); - } - else - { - graph_inputs[i] = NULL; - } - } - num_of_graph_outputs = graph->output.num; - if( graph->complete_signal.exists ) - { - num_of_graph_outputs += 1; - } - graph_outputs = (vx_reference *)malloc( num_of_graph_outputs * sizeof( vx_reference ) ); - for( i = 0; i < num_of_graph_outputs; i++ ) - { - tensor = vsi_nn_GetTensor( graph, graph->output.tensors[i] ); - if (tensor) - { - graph_outputs[i] = (vx_reference)( tensor->t ); - } - else - { - graph_outputs[i] = NULL; - } - } - if( graph->complete_signal.exists ) - { - graph_outputs[num_of_graph_outputs - 1] = \ - (vx_reference)graph->complete_signal.tensor->t; - } - status = vxIdentifyGraphInputsAndOutputs( graph->g, - num_of_graph_inputs, - graph_inputs, - num_of_graph_outputs, - graph_outputs ); - - if( VSI_SUCCESS != status ) - { - goto final; - } + /* Setup binary graph inputs and outputs. */ + status = vsi_nn_setup_binary_graph_inputs_outputs( graph ); + TEST_CHECK_STATUS( status, final ); final: if( NULL != sorted_nodes ) @@ -728,14 +678,6 @@ final: { free( nodes_list ); } - if ( NULL != graph_inputs) - { - free( graph_inputs ); - } - if ( NULL != graph_outputs) - { - free( graph_outputs ); - } return status; } /* vsi_nn_SetupGraph() */ @@ -1599,33 +1541,46 @@ void vsi_nn_DumpGraphToJson for(j = 0; j < node->input.num; j++) { tio = &tensor_ref[node->input.tensors[j]]; - if(tio->input.num > 0) + if(NULL == vsi_nn_GetTensor(graph, node->input.tensors[j])) { - table = tio->input.table; - - /* tensor only 1 input node */ - in_node = vsi_nn_GetNode(graph, table[0].node); if(j == node->input.num - 1) { - fprintf(fp, "\"@uid_%u:out%u\" ", in_node->uid, table[0].index); + fprintf(fp, "\"not used\" "); } else { - fprintf(fp, "\"@uid_%u:out%u\", ", in_node->uid, table[0].index); + fprintf(fp, "\"not used\", "); } } else { - if(j == node->input.num - 1) + if(tio->input.num > 0) { - fprintf(fp, "\"datainput_%u:out0\" ", j); + table = tio->input.table; + + /* tensor only 1 input node */ + in_node = vsi_nn_GetNode(graph, table[0].node); + if(j == node->input.num - 1) + { + fprintf(fp, "\"@uid_%u:out%u\" ", in_node->uid, table[0].index); + } + else + { + fprintf(fp, "\"@uid_%u:out%u\", ", in_node->uid, table[0].index); + } } else { - fprintf(fp, "\"datainput_%u:out0\", ", j); + if(j == node->input.num - 1) + { + fprintf(fp, "\"datainput_%u:out0\" ", j); + } + else + { + fprintf(fp, "\"datainput_%u:out0\", ", j); + } } } - } /* dump input shape */ @@ -1633,14 +1588,14 @@ void vsi_nn_DumpGraphToJson for(j = 0; j < node->input.num; j++) { tensor = vsi_nn_GetTensor(graph, node->input.tensors[j]); - if(vsi_nn_ShapeToString( tensor->attr.size, tensor->attr.dim_num, + if(NULL != tensor && vsi_nn_ShapeToString( tensor->attr.size, tensor->attr.dim_num, shape, _SHAPE_BUF_SIZE, TRUE ) > 0) { fprintf(fp, "[%s ]", shape); } else { - fprintf(fp, "[ - ]"); + fprintf(fp, "[]"); } if(j < node->input.num - 1) { @@ -1667,14 +1622,14 @@ void vsi_nn_DumpGraphToJson for(j = 0; j < node->output.num; j++) { tensor = vsi_nn_GetTensor(graph, node->output.tensors[j]); - if(vsi_nn_ShapeToString( tensor->attr.size, tensor->attr.dim_num, + if(NULL != tensor && vsi_nn_ShapeToString( tensor->attr.size, tensor->attr.dim_num, shape, _SHAPE_BUF_SIZE, 
TRUE ) > 0) { fprintf(fp, "[%s ]", shape); } else { - fprintf(fp, "[ - ]"); + fprintf(fp, "[]"); } if(j < node->output.num - 1) { @@ -1762,6 +1717,124 @@ final: return status; } /* vsi_nn_TrySetupCompleteSignalNode() */ + +/* + * Documented in vsi_nn_graph.h + */ +vsi_status vsi_nn_setup_binary_graph_inputs_outputs + ( + vsi_nn_graph_t* graph + ) +{ + uint32_t i,j; + vsi_status status; + uint32_t num_of_graph_inputs; + uint32_t num_of_graph_real_inputs; + vx_reference *graph_inputs = NULL; + uint32_t num_of_graph_outputs; + uint32_t num_of_graph_real_outputs; + vx_reference *graph_outputs = NULL; + vsi_nn_tensor_t *tensor; + + num_of_graph_real_inputs = 0; + num_of_graph_real_outputs = 0; + + /* Explicitly set graph inputs and outputs */ + num_of_graph_inputs = graph->input.num; + for( i = 0; i < num_of_graph_inputs; i++ ) + { + tensor = vsi_nn_GetTensor( graph, graph->input.tensors[i] ); + if (tensor) + { + num_of_graph_real_inputs += 1; + } + else + { + ;//do nothing + } + } + graph_inputs = (vx_reference *)malloc( num_of_graph_real_inputs * sizeof( vx_reference ) ); + for( i = 0, j = 0; i < num_of_graph_inputs; i++ ) + { + tensor = vsi_nn_GetTensor( graph, graph->input.tensors[i] ); + if (tensor) + { + if(j > num_of_graph_real_inputs -1) + { + status = VSI_FAILURE; + goto final; + } + graph_inputs[j++] = (vx_reference)( tensor->t ); + } + else + { + ;//do nothing + } + } + num_of_graph_outputs = graph->output.num; + if( graph->complete_signal.exists ) + { + num_of_graph_outputs += 1; + } + for( i = 0; i < num_of_graph_outputs; i++ ) + { + tensor = vsi_nn_GetTensor( graph, graph->output.tensors[i] ); + if (tensor) + { + num_of_graph_real_outputs += 1; + } + else + { + ;//do nothing + } + } + graph_outputs = (vx_reference *)malloc( num_of_graph_real_outputs * sizeof( vx_reference ) ); + for( i = 0, j = 0; i < num_of_graph_outputs; i++ ) + { + tensor = vsi_nn_GetTensor( graph, graph->output.tensors[i] ); + if (tensor) + { + if(j > num_of_graph_real_outputs -1) + { + status = VSI_FAILURE; + goto final; + } + graph_outputs[j++] = (vx_reference)( tensor->t ); + } + else + { + ;//do nothing + } + } + if( graph->complete_signal.exists ) + { + graph_outputs[num_of_graph_real_outputs - 1] = \ + (vx_reference)graph->complete_signal.tensor->t; + } + + status = vxIdentifyGraphInputsAndOutputs( graph->g, + num_of_graph_real_inputs, + graph_inputs, + num_of_graph_real_outputs, + graph_outputs ); + + if( VSI_SUCCESS != status ) + { + goto final; + } + +final: + if ( NULL != graph_inputs) + { + free( graph_inputs ); + } + if ( NULL != graph_outputs) + { + free( graph_outputs ); + } + return status; +} /* vsi_nn_setup_binary_graph_inputs_outputs() */ + vsi_status vsi_nn_SetupRNNConnections ( vsi_nn_graph_t* graph, diff --git a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c index 51083b1..c7c49a6 100644 --- a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c +++ b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c @@ -241,6 +241,31 @@ static void _set_preproc_node_input_attr input_attr->size[2] = 1; } } + + if(*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR || *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_GRAY) + { + if(*source_layout == VSI_NN_SOURCE_LAYOUT_NHWC) + { + input_attr->size[0] = input_size->w; + input_attr->size[1] = input_size->h; + input_attr->size[2] = input_size->c; + } + } + + if(*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_BGRA) + { + if(*source_layout == VSI_NN_SOURCE_LAYOUT_NHWC) + { + input_attr->size[0] = 
4*input_attr->size[1]; + input_attr->size[1] = input_attr->size[2]; + input_attr->size[2] = 1; + } + else + { + input_attr->size[0] = 4*input_attr->size[0]; + input_attr->size[2] = 1; + } + } } /*_set_preproc_node_input_attr() */ static void _set_preproc_node_output_attr
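[Editor's note on the BGRA branch above] With either source layout the BGRA preprocess input is reshaped to a flat 4*W x H x 1 byte plane. A minimal worked example of the resulting attr, assuming an illustrative 224x224 BGRA source (the concrete W/H values are not from the patch):

    /* BGRA source, W = 224, H = 224 (illustrative values only) */
    input_attr->size[0] = 4 * 224;  /* 896: interleaved B,G,R,A bytes per row */
    input_attr->size[1] = 224;      /* image rows */
    input_attr->size[2] = 1;        /* single plane */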